def compute(file_obj):
    """Fingerprint the frame data of an MP3 file.

    Only the chunks that mp3_frame.split() yields together with a header
    are fed into the hash; chunks yielded without a header are skipped,
    so they have no influence on the result.

    Args:
      file_obj: A file-like object

    Returns:
      The SHA-1 hex digest (a 40-character string of hex digits) of the
      concatenated frame data, or None if no chunk came with a header.
    """
    digest = hashlib.sha1()
    found_frame = False
    for hdr, data_buffer in mp3_frame.split(file_obj):
        if hdr is None:
            # Chunk without a header: does not contribute to the hash.
            continue
        digest.update(data_buffer)
        found_frame = True
    return digest.hexdigest() if found_frame else None
Beispiel #2
0
def analyze(file_obj, au_file, compute_fingerprint=True, get_payload=True):
    """Scan an MP3 stream and record what was found on an AudioFile.

    Every chunk yielded by mp3_frame.split() that carries a header is
    treated as one MPEG frame; chunks without a header are skipped.

    Args:
      file_obj: A file-like object.
      au_file: An AudioFile object to store the results of the analysis in.
      compute_fingerprint: If True, store the SHA-1 hex digest of all
        frame data in au_file.fingerprint.
      get_payload: If True, store the concatenated frame data in
        au_file.payload.

    Returns:
      The same AudioFile object that was passed in as au_file, with
      frame_count, frame_size, duration_ms and mp3_header set (plus
      fingerprint and payload when requested).

    Raises:
      InvalidFileError: if a frame header does not match the first
        header seen, or if fewer than _MINIMUM_FRAMES frames are found.
    """
    au_file.frame_count = 0
    au_file.frame_size = 0
    au_file.duration_ms = 0

    # Both accumulators are always created, but each one is only fed and
    # read when the corresponding flag is set.
    digest = hashlib.sha1()
    frame_bytes = cStringIO.StringIO()

    kbps_total = 0       # Sum of per-frame bit rates, for the average.
    template = None      # First header seen, with varying fields blanked.
    initial_kbps = None  # Bit rate of that first header.
    varies = False       # Set once a frame's bit rate differs from it.

    for hdr, data_buffer in mp3_frame.split(file_obj):
        if hdr is None:
            continue

        au_file.frame_count += 1
        au_file.frame_size += len(data_buffer)
        au_file.duration_ms += hdr.duration_ms
        if compute_fingerprint:
            digest.update(data_buffer)
        if get_payload:
            frame_bytes.write(data_buffer)

        # Once a template exists, every later header has to match it.
        if template:
            if not hdr.match(template):
                message = "Bad header: found %s, expected %s (path=%s)" % (
                    hdr, template, au_file.path)
                raise InvalidFileError(message)
            # A differing bit rate marks the file as variable bit-rate.
            if hdr.bit_rate_kbps != initial_kbps:
                varies = True

        # This must happen before the template is set up below, because
        # that step blanks the bit rate on the first header object.
        kbps_total += hdr.bit_rate_kbps

        # The first header object itself becomes the template (no copy
        # is made); the fields that may vary between frames are blanked.
        if template is None:
            template = hdr
            initial_kbps = template.bit_rate_kbps
            template.bit_rate_kbps = None  # Might be a VBR file.
            template.padding = None  # Not all frames are padded.
            template.frame_size = None
            # You'd think that this would be constant, but MP3s
            # encountered in the wild prove otherwise.
            template.protected = None

    if au_file.frame_count < _MINIMUM_FRAMES:
        raise InvalidFileError(
            "Found only %d MPEG frames" % au_file.frame_count)

    # Restore a bit rate on the template: the per-frame average for a
    # variable bit-rate file, otherwise the rate of the first frame.
    template.bit_rate_kbps = (
        float(kbps_total) / au_file.frame_count if varies else initial_kbps)

    # Finish populating the AudioFile object and hand it back.
    au_file.mp3_header = template
    # Truncate the accumulated duration to an integer.
    au_file.duration_ms = int(au_file.duration_ms)
    if compute_fingerprint:
        au_file.fingerprint = digest.hexdigest()
    if get_payload:
        au_file.payload = frame_bytes.getvalue()
    return au_file
    def test_split(self):
        """Check the three mp3_frame splitting entry points against each other.

        Each test case is a list of byte chunks (valid frames, junk,
        truncated headers/frames and ID3 tags).  The chunks are joined
        and fed to mp3_frame.split(); the result must reproduce the
        chunk list exactly, with a header only on the valid frames.
        mp3_frame.split_blocks() and mp3_frame.split_one_block() must
        then return the same sequence as mp3_frame.split().
        """
        raw_hdr, hdr = mp3_header_test.VALID_MP3_HEADERS.items()[0]
        frame = raw_hdr.ljust(hdr.frame_size, "a")
        # A fragment of a header, and a frame that has been cut short.
        header_fragment = raw_hdr[:3]
        cut_frame = frame[:25]
        assert len(cut_frame) < len(frame)

        id3_tag = id3_header.create_test_header(77).ljust(77, "b")

        # An ID3 tag with a valid frame tag stashed inside.
        evil_id3_tag = id3_header.create_test_header(50) + raw_hdr
        evil_id3_tag = evil_id3_tag.ljust(50, "c")

        cases = [
            [frame],
            [frame, frame],
            ['junk', frame],
            ['junk', frame, frame],
            ['junk', frame, frame, 'junk'],
            ['junk', frame, frame, 'junk', frame],
            # Check handling of truncated headers and frames.
            [header_fragment],
            ['junk', header_fragment],
            ['junk', cut_frame],
            [frame, header_fragment],
            [frame, cut_frame],
            [frame, 'junk', cut_frame],
            [frame, 'junk', header_fragment],
            # ID3 headers mixed in
            [id3_tag, frame],
            [frame, id3_tag],
            [id3_tag, frame],
            [id3_tag, frame, id3_tag],
            [evil_id3_tag, frame, "junk"],
            ["junk", frame, evil_id3_tag, frame],
            ["junk", frame, evil_id3_tag, frame, "junk"],
            ["junk" + evil_id3_tag, id3_tag, frame, evil_id3_tag],
            # Some longer sequences
            500 * [frame],
            500 * ["junk", frame, id3_tag, frame],
        ]

        for seq in cases:
            joined = ''.join(seq)
            from_stream = list(mp3_frame.split(cStringIO.StringIO(joined)))
            from_blocks = list(mp3_frame.split_blocks(iter(seq)))
            from_one_block = mp3_frame.split_one_block(joined)

            # mp3_frame.split() must give back exactly the chunks we
            # joined together, with a header only on the valid frames.
            self.assertEqual(len(seq), len(from_stream))
            for wanted, (found_hdr, found_data) in zip(seq, from_stream):
                self.assertEqual(wanted, found_data)
                if wanted != frame:
                    self.assertTrue(found_hdr is None)
                    continue
                self.assertTrue(found_hdr is not None)
                self.assertTrue(found_hdr.match(hdr))
                self.assertEqual(hdr.frame_size, len(frame))

            # The other two entry points must match what
            # mp3_frame.split() returned, pair by pair.
            for other in (from_blocks, from_one_block):
                self.assertEqual(len(seq), len(other))
                for (hdr1, data1), (hdr2, data2) in zip(from_stream, other):
                    self.assertEqual(str(hdr1), str(hdr2))
                    self.assertEqual(data1, data2)
Beispiel #4
0
def analyze(file_obj, au_file, compute_fingerprint=True, get_payload=True):
    """Scan an MP3 stream and record what was found on an AudioFile.

    Every chunk yielded by mp3_frame.split() that carries a header is
    treated as one MPEG frame; chunks without a header are skipped.

    Args:
      file_obj: A file-like object.
      au_file: An AudioFile object to store the results of the analysis in.
      compute_fingerprint: If True, store the SHA-1 hex digest of all
        frame data in au_file.fingerprint.
      get_payload: If True, store the concatenated frame data in
        au_file.payload.

    Returns:
      The same AudioFile object that was passed in as au_file, with
      frame_count, frame_size, duration_ms and mp3_header set (plus
      fingerprint and payload when requested).

    Raises:
      InvalidFileError: if a frame header does not match the first
        header seen, or if fewer than _MINIMUM_FRAMES frames are found.
    """
    au_file.frame_count = 0
    au_file.frame_size = 0
    au_file.duration_ms = 0

    # Both accumulators are always created, but each one is only fed and
    # read when the corresponding flag is set.
    digest = hashlib.sha1()
    frame_bytes = cStringIO.StringIO()

    kbps_total = 0       # Sum of per-frame bit rates, for the average.
    template = None      # First header seen, with varying fields blanked.
    initial_kbps = None  # Bit rate of that first header.
    varies = False       # Set once a frame's bit rate differs from it.

    for hdr, data_buffer in mp3_frame.split(file_obj):
        if hdr is None:
            continue

        au_file.frame_count += 1
        au_file.frame_size += len(data_buffer)
        au_file.duration_ms += hdr.duration_ms
        if compute_fingerprint:
            digest.update(data_buffer)
        if get_payload:
            frame_bytes.write(data_buffer)

        # Once a template exists, every later header has to match it.
        if template:
            if not hdr.match(template):
                message = "Bad header: found %s, expected %s (path=%s)" % (
                    hdr, template, au_file.path)
                raise InvalidFileError(message)
            # A differing bit rate marks the file as variable bit-rate.
            if hdr.bit_rate_kbps != initial_kbps:
                varies = True

        # This must happen before the template is set up below, because
        # that step blanks the bit rate on the first header object.
        kbps_total += hdr.bit_rate_kbps

        # The first header object itself becomes the template (no copy
        # is made); the fields that may vary between frames are blanked.
        if template is None:
            template = hdr
            initial_kbps = template.bit_rate_kbps
            template.bit_rate_kbps = None  # Might be a VBR file.
            template.padding = None  # Not all frames are padded.
            template.frame_size = None
            # You'd think that this would be constant, but MP3s
            # encountered in the wild prove otherwise.
            template.protected = None

    if au_file.frame_count < _MINIMUM_FRAMES:
        raise InvalidFileError(
            "Found only %d MPEG frames" % au_file.frame_count)

    # Restore a bit rate on the template: the per-frame average for a
    # variable bit-rate file, otherwise the rate of the first frame.
    template.bit_rate_kbps = (
        float(kbps_total) / au_file.frame_count if varies else initial_kbps)

    # Finish populating the AudioFile object and hand it back.
    au_file.mp3_header = template
    # Truncate the accumulated duration to an integer.
    au_file.duration_ms = int(au_file.duration_ms)
    if compute_fingerprint:
        au_file.fingerprint = digest.hexdigest()
    if get_payload:
        au_file.payload = frame_bytes.getvalue()
    return au_file