def test_posting_content(self):
        """
        Tests that binary content attached to an article can be split
        into evenly sized parts
        """
        # The article that will carry our binary content
        post = NNTPArticle(
            subject='woo-hoo',
            poster='<*****@*****.**>',
            id='random-id',
            groups='alt.binaries.l2g',
            work_dir=self.tmp_dir,
        )

        # Location of the 512K file we are about to generate
        payload = join(self.tmp_dir, 'NNTPArticle_Test.posting', 'file.tmp')

        # Nothing should be there yet
        assert isfile(payload) is False
        # Populate it with random data
        assert self.touch(payload, size='512K', random=True) is True
        # It must be present on disk at this point
        assert isfile(payload) is True

        # Wrap the file in a content object and attach it to the article
        binary = NNTPBinaryContent(filepath=payload, work_dir=self.tmp_dir)
        assert post.add(binary) is True

        # 512K split at a 128K boundary is expected to yield 4 entries
        parts = post.split('128K')
        assert isinstance(parts, sortedset) is True
        assert len(parts) == 4
Beispiel #2
0
    def test_nzbfile_generation(self):
        """
        Tests that an NZB-File can be assembled from objects and saved
        """
        target_nzb = join(self.tmp_dir, 'test.nzbfile.nzb')
        source_file = join(self.var_dir, 'uudecoded.tax.jpg')

        # The NZB-File must not exist before we save it
        assert isfile(target_nzb) is False

        # The NZB object we will populate
        nzb = NNTPnzb()

        # Build a fake post -> article -> content chain
        post = NNTPSegmentedPost(basename(source_file))
        binary = NNTPBinaryContent(source_file)
        fake_article = NNTPArticle('testfile', groups='newsreap.is.awesome')

        # The segment tracker has not been flagged yet; it only changes
        # once segments are added manually or an NZB-File is parsed
        assert nzb._segments_loaded is None

        # Content goes into the article, the article into the segmented
        # post and the segmented post into the NZB object
        fake_article.add(binary)
        post.add(fake_article)
        nzb.add(post)

        # Calling .add() flips the tracker to True
        assert nzb._segments_loaded is True

        # Write the NZB-File out and confirm it landed on disk
        assert nzb.save(target_nzb) is True
        assert isfile(target_nzb) is True
    def test_post_iter(self):
        """
        Tests that post_iter() only hands back an iterator once the
        article has groups, a subject and a poster defined.

        """
        # Start with an article that has nothing but a body
        post = NNTPArticle(
            subject='',
            poster='',
            body='hello world',
            work_dir=self.tmp_dir,
        )

        # Blank subject, blank poster and no groups: nothing to iterate
        assert post.post_iter() is None

        # A group alone is not enough
        post.groups.add('alt.binaries.test')
        assert post.post_iter() is None

        # A group and a subject is still not enough
        post.subject = 'Subject'
        assert post.post_iter() is None

        # With the poster set everything required is in place
        post.poster = '*****@*****.**'

        lines = post.post_iter()
        assert lines is not None
        for line in lines:
            assert isinstance(line, basestring) is True
    def test_general_features(self):
        """
        NNTPSegmentedPost manages a list of NNTPArticles

        Tests the basic functionality of the object

        """
        post = NNTPSegmentedPost('mytestfile')

        # With no entries the post is not valid
        assert segobj_is_invalid(post) if False else post.is_valid() is False

        entry = NNTPArticle(work_dir=self.tmp_dir)

        # The first add succeeds and is reflected in the length
        assert post.add(entry) is True
        assert len(post) == 1

        # Still not valid; the entry was never loaded or retrieved
        assert post.is_valid() is False

        # Adding the very same article again is refused
        assert post.add(entry) is False
        assert len(post) == 1

        # Anything that is not an article is refused too
        assert post.add(None) is False
        assert post.add("bad bad") is False
        assert post.add(1) is False
        assert len(post) == 1

        # Iterating yields NNTPArticle objects
        for item in post:
            assert isinstance(item, NNTPArticle)
    def test_article_copy(self):
        """
        copy() produces a duplicate of an article that can be altered
        without disturbing the article it was created from.
        """

        scratch = join(self.tmp_dir, 'NNTPArticle_Test.test_article_copy')

        # Two 512K files filled with random data
        first_file = join(scratch, 'file01.tmp')
        second_file = join(scratch, 'file02.tmp')
        assert self.touch(first_file, size='512K', random=True) is True
        assert self.touch(second_file, size='512K', random=True) is True

        # The article we will copy
        original = NNTPArticle(
            subject='woo-hoo',
            poster='<*****@*****.**>',
            id='random-id',
            groups='alt.binaries.l2g',
            work_dir=self.tmp_dir,
        )

        # Attach both files as part 1 and part 2
        part = NNTPBinaryContent(
            filepath=first_file, part=1, work_dir=self.tmp_dir)
        assert original.add(part) is True
        part = NNTPBinaryContent(
            filepath=second_file, part=2, work_dir=self.tmp_dir)
        assert original.add(part) is True

        # Both content entries are accounted for
        assert len(original) == 2

        # Add a couple of header entries
        original.header['Test'] = 'test'
        original.header['Another-Entry'] = 'test2'

        # Duplicate the article
        duplicate = original.copy()

        assert len(duplicate) == len(original)
        assert len(duplicate.header) == len(original.header)

        # Changing one object must not affect the other; the two
        # headers are expected to be independent of each other
        original.header['Yet-Another-Entry'] = 'test3'
        assert len(duplicate.header)+1 == len(original.header)
    def test_msgid(self):
        """
        Tests that a Message-ID is generated on demand and only changes
        when a reset is requested

        """
        post = NNTPArticle(work_dir=self.tmp_dir)

        # No identifier is set to begin with
        assert post.id == ''

        # Ask for a Message-ID; it becomes the article id
        first_id = post.msgid()
        assert post.id == first_id

        # Asking again hands back the very same value
        assert post.msgid() == first_id

        # A reset forces a new identifier to be generated
        second_id = post.msgid(reset=True)

        # The old identifier is gone
        assert post.id != first_id
        assert second_id != first_id

        # The new identifier is what is reported and stored from now on
        assert post.msgid() == second_id
        assert post.id == second_id
Beispiel #7
0
    def test_yenc_v1_3_NNTPArticle_encode_01(self):
        """
        Tests yEnc encoding of an article; a step required prior to posting
        """

        # Importing yenc here is deliberate: if the library is missing
        # the ImportError makes this test fail
        import yenc

        # The binary file we are going to encode
        source = join(self.var_dir, 'joystick.jpg')
        assert isfile(source)

        # A ready-made codec instance
        codec = CodecYenc(work_dir=self.test_dir)

        # An article holding just our file
        post = NNTPArticle()
        post.add(source)

        # Encode by passing a codec instance
        by_object = post.encode(codec)
        assert isinstance(by_object, NNTPArticle) is True
        # The encoded article must carry content
        assert len(by_object) > 0

        # Encode by passing the codec class
        by_type = post.encode(CodecYenc)
        assert isinstance(by_type, NNTPArticle) is True
        # The encoded article must carry content
        assert len(by_type) > 0

        # Either way of encoding must produce identical content
        assert by_object[0].md5() == by_type[0].md5()

        # Encoders may also be chained by passing a list
        chained = post.encode(
            [CodecYenc, CodecYenc(work_dir=self.test_dir)], )

        assert isinstance(chained, NNTPArticle) is True
        # The encoded article must carry content
        assert len(chained) > 0
Beispiel #8
0
    def test_yenc_v1_3_NNTPArticle_encode_01(self):
        """
        Tests yEnc encoding of an article; a step required prior to posting
        """

        # Importing yenc here is deliberate: if the library is missing
        # the ImportError makes this test fail
        import yenc

        # The binary file we are going to encode
        jpeg_path = join(self.var_dir, 'joystick.jpg')
        assert isfile(jpeg_path)

        # A ready-made codec instance
        yenc_codec = CodecYenc(work_dir=self.test_dir)

        # An article holding just our file
        src_article = NNTPArticle()
        src_article.add(jpeg_path)

        # First pass: hand encode() a codec instance
        encoded_a = src_article.encode(yenc_codec)
        assert isinstance(encoded_a, NNTPArticle) is True
        assert len(encoded_a) > 0

        # Second pass: hand encode() the codec class itself
        encoded_b = src_article.encode(CodecYenc)
        assert isinstance(encoded_b, NNTPArticle) is True
        assert len(encoded_b) > 0

        # Both passes must have generated the same content
        assert encoded_a[0].md5() == encoded_b[0].md5()

        # Third pass: a list of encoders chains them together
        encoded_chain = src_article.encode(
            [CodecYenc, CodecYenc(work_dir=self.test_dir)],
        )
        assert isinstance(encoded_chain, NNTPArticle) is True
        assert len(encoded_chain) > 0
Beispiel #9
0
    def test_NNTPArticle_UU_encode_01(self):
        """
        Tests UU encoding of an article; a step required prior to posting
        """

        # The binary file we are going to encode
        source = join(self.var_dir, 'joystick.jpg')
        assert isfile(source)

        # A ready-made codec instance
        codec = CodecUU(work_dir=self.test_dir)

        # An article holding just our file
        post = NNTPArticle()
        post.add(source)

        # Encode by passing a codec instance
        by_object = post.encode(codec)
        assert isinstance(by_object, NNTPArticle) is True
        # The encoded article must carry content
        assert len(by_object) > 0

        # Encode by passing the codec class
        by_type = post.encode(CodecUU)
        assert isinstance(by_type, NNTPArticle) is True
        # The encoded article must carry content
        assert len(by_type) > 0

        # Either way of encoding must produce identical content
        assert by_object[0].md5() == by_type[0].md5()

        # Encoders may also be chained by passing a list
        chained = post.encode(
            [CodecUU, CodecUU(work_dir=self.test_dir)],
        )
        assert isinstance(chained, NNTPArticle) is True
        # The encoded article must carry content
        assert len(chained) > 0
    def test_loading_response(self):
        """
        Tests load() on an article, fed with both responses and articles
        """

        # A response holding a single (empty) binary content object
        reply = NNTPResponse(200, 'Great Data')
        reply.decoded.add(NNTPBinaryContent(work_dir=self.tmp_dir))

        target = NNTPArticle(id='random-id', work_dir=self.tmp_dir)

        # Nothing has been loaded, so the article is not valid
        assert target.is_valid() is False

        # Load the response and inspect what the article looks like
        assert target.load(reply) is True
        assert target.header is None
        assert len(target.decoded) == 1
        assert len(target.decoded) == len(target.files())
        assert str(target) == 'random-id'
        assert unicode(target) == u'random-id'
        assert target.size() == 0

        # There is content now, but it is empty; still not valid
        assert target.is_valid() is False

        # repr() exposes the Message-ID ...
        found = re.search(' Message-ID=\"(?P<id>[^\"]+)\"', repr(target))
        assert found is not None
        assert found.group('id') == str(target)

        # ... and the number of attachments
        found = re.search(' attachments=\"(?P<no>[^\"]+)\"', repr(target))
        assert found is not None
        assert int(found.group('no')) == len(target)

        # Two articles to compare against each other
        lhs = NNTPArticle(id='a', work_dir=self.tmp_dir)
        rhs = NNTPArticle(id='b', work_dir=self.tmp_dir)
        assert (lhs < rhs) is True

        # Bumping the sort number changes the outcome of the comparison
        lhs.no += 1
        assert (lhs < rhs) is False

        # This time the response carries a header too
        reply = NNTPResponse(200, 'Great Data')
        reply.decoded.add(NNTPHeader(work_dir=self.tmp_dir))
        reply.decoded.add(NNTPBinaryContent(work_dir=self.tmp_dir))

        target = NNTPArticle(id='random-id', work_dir=self.tmp_dir)

        # The header is stored on the article, the content in decoded
        assert target.load(reply) is True
        assert isinstance(target.header, NNTPHeader)
        assert len(target.decoded) == 1

        # Indexing the article lines up with iterating its decoded set
        for idx, entry in enumerate(target.decoded):
            assert target[idx] == entry

        # An article can be loaded on top of another one; this is used
        # when associating downloaded articles with the ones found in
        # NZB-Files
        overlay = NNTPArticle(
            msgid='brand-new-id',
            no=target.no+1,
            groups='a.b.c,d.e.f',
            work_dir=self.tmp_dir,
        )
        overlay.subject = 'test-subject-l2g'
        overlay.poster = 'test-poster-l2g'
        overlay.header = 'test-header-l2g'

        assert target.load(overlay) is True
        assert target.id == overlay.id
        assert target.no == overlay.no
        assert target.groups == overlay.groups
        assert target.poster == overlay.poster
        assert target.subject == overlay.subject
        assert target.header == overlay.header
        assert target.body == overlay.body
        assert target.decoded == overlay.decoded
        assert target.groups == overlay.groups
Beispiel #11
0
    def test_get_paths(self):
        """
        Tests how get_paths() reduces assorted input to a set of paths

        """
        # Scratch folder for this test
        work_dir = join(self.tmp_dir, 'CodecFile_Test', 'work')

        # No volume_size is specified here
        codec = CodecFile(work_dir=work_dir)

        # Populate the work directory with ten 120K dummy files
        dummy_files = set()
        for idx in range(10):
            dummy = join(work_dir, 'DSC_IMG%.3d.jpeg' % idx)
            self.touch(dummy, size='120K', random=True)
            dummy_files.add(dummy)

        # A path that points at nothing
        missing = join(self.tmp_dir, 'non-existant-file')
        assert isfile(missing) is False

        # First content object, with data written to it
        first_content = NNTPContent(
            join(work_dir, 'testfile'),
            work_dir=self.tmp_dir,
        )
        first_content.write('test data')

        # An (initially empty) article
        post = NNTPArticle(work_dir=self.tmp_dir)

        # Second content object, also with data written to it
        second_content = NNTPContent(
            join(work_dir, 'testfile2'),
            work_dir=self.tmp_dir,
        )
        second_content.write('some more test data')

        # Only the first content object goes into the article
        assert post.add(first_content) is True

        # A sub-directory inside of the work directory
        sub_dir = join(work_dir, 'subdir')
        assert mkdir(sub_dir) is True
        assert isdir(sub_dir) is True

        # A plain string is accepted
        assert len(codec.get_paths(self.tmp_dir)) == 1
        assert codec.get_paths(self.tmp_dir).pop() == self.tmp_dir

        # A sub-directory is dropped when a directory above it is also
        # part of the input
        assert len(codec.get_paths([self.tmp_dir, sub_dir])) == 1
        assert codec.get_paths([self.tmp_dir, sub_dir]).pop() == self.tmp_dir

        # Paths that do not exist yield nothing
        assert len(codec.get_paths(missing)) == 0

        # A set mixing directories, content objects, a bad path and files
        mixed_set = set([
            work_dir,
            sub_dir,
            second_content,
            first_content,
            missing,
        ]) | set(dummy_files)

        # work_dir swallows everything stored beneath it and the missing
        # path is tossed; the two content objects are reported by their
        # own filepath, giving 3 results in total
        found = codec.get_paths(mixed_set)
        assert len(found) == 3
        assert work_dir in found
        assert first_content.filepath in found
        assert second_content.filepath in found

        # The same input minus work_dir, passed as a list this time to
        # show lists are supported too; the result is much larger
        mixed_list = [
            sub_dir,
            second_content,
            first_content,
            missing,
        ]
        mixed_list.extend(dummy_files)

        found = codec.get_paths(mixed_list)
        # +1 for content
        # +1 for sub_dir
        assert len(found) == (len(dummy_files) + len(post) + 2)
        for path in dummy_files:
            # Every dummy file is reported individually
            assert path in found
        assert first_content.filepath in found
        assert second_content.filepath in found
        assert sub_dir in found
Beispiel #12
0
    def next(self):
        """
        Python 2 support
        Support stream type functions and iterations

        When self._segment_iter is set, the next of its entries accepted
        by self._valid_by_mode() is returned. Otherwise the next element
        is pulled from self.xml_iter and an NNTPSegmentedPost is built
        from it (one NNTPArticle holding an NNTPEmptyContent placeholder
        per segment); entries rejected by self._valid_by_mode() are
        skipped.

        Raises StopIteration when the XML iterator has nothing more to
        offer. This is also the outcome when the NZB-File is missing or
        corrupt, in which case self._lazy_is_valid is set to False.
        """

        # We track our iterator since we move along if our mode tells us to do
        # so.
        _iter = None

        if self.xml_root is not None:
            # clear our unused memory
            self.xml_root.clear()

        if self._segment_iter:
            # Keep pulling entries until one passes the mode check; the
            # loop is only left by returning or by an exception raised
            # from the iterator's next()
            while 1:
                _iter = self._segment_iter.next()
                if self._valid_by_mode(_iter):
                    return _iter

        # get the root element
        try:
            _, self.xml_root = self.xml_iter.next()

            # Increment our iterator
            self.xml_itr_count += 1

        except StopIteration:
            # let this pass through
            self.xml_root = None
            self.xml_itr_count = 0

        except IOError:
            logger.warning('NZB-File is missing: %s' % self.filepath)
            self.xml_root = None
            self.xml_itr_count = 0
            # Mark situation
            self._lazy_is_valid = False

        except XMLSyntaxError as e:
            # e[0] relies on Python 2 exception indexing
            if e[0] is not None:
                # We have corruption
                logger.error("NZB-File '%s' is corrupt" % self.filepath)
                logger.debug('NZB-File XMLSyntaxError Exception %s' % str(e))
                # Mark situation
                self._lazy_is_valid = False
            # else:
            # this is a bug with lxml in earlier versions
            # https://bugs.launchpad.net/lxml/+bug/1185701
            # It occurs when the end of the file is reached and lxml
            # simply just doesn't handle the closure properly
            # it was fixed here:
            # https://github.com/lxml/lxml/commit\
            #       /19f0a477c935b402c93395f8c0cb561646f4bdc3
            # So we can relax and return ok results here
            self.xml_root = None
            self.xml_itr_count = 0

        except Exception as e:
            logger.error("NZB-File '%s' is corrupt" % self.filepath)
            logger.debug('NZB-File Exception %s' % str(e))
            # Mark situation
            self._lazy_is_valid = False

        if self.xml_root is None or len(self.xml_root) == 0:
            # Nothing (left) to process; reset our state and stop
            self.xml_iter = None
            self.xml_root = None
            self.xml_itr_count = 0

            raise StopIteration()

        if self.meta is None:
            # Attempt to populate meta information
            self.meta = {}

            for meta in self.xml_root.xpath('/ns:nzb/ns:head[1]/ns:meta',
                                            namespaces=NZB_LXML_NAMESPACES):
                # Store the Meta Information Detected
                self.meta[meta.attrib['type'].decode(self.encoding)] = \
                    self.unescape_xml(meta.text.strip())

        # Acquire the Segments Groups
        groups = [
            group.text.strip().decode(self.encoding)
            for group in self.xml_root.xpath(
                'ns:groups/ns:group',
                namespaces=NZB_LXML_NAMESPACES,
            )
        ]

        # The detected filename
        _filename = ''

        # The name from the meta tag
        _name = self.meta.get('name', '').decode(self.encoding).strip()

        if not _name and self.filepath is not None:
            # Lets try to generate a name from our NZB-File
            tmpfname = basename(self.filepath)

            # Strip our extension off the end (if present); the pattern
            # has to be applied through search() since a compiled regular
            # expression can not be called directly
            result = NZB_EXTENSION_RE.search(tmpfname)
            if result and result.group('fname'):
                # Store our new filename as our name
                _name = result.group('fname')

        # Subject
        _subject = self.unescape_xml(
                self.xml_root.attrib.get('subject', '')).decode(self.encoding)

        # Poster
        _poster = self.unescape_xml(
                self.xml_root.attrib.get('poster', '')).decode(self.encoding)

        # Use our Codec(s) to extract our Yenc Subject
        matched = None
        for c in self._codecs:
            # for each entry, parse our article
            matched = c.parse_article(
                subject=_subject,
                poster=_poster,
            )
            if matched:
                # We matched
                break

        if matched:
            # We successfully got a filename from our subject line
            _filename = matched.get('fname', '').strip()
            if _filename and _name:
                # always allow the name to over-ride the detected filename
                # if we actually have a real name we can associate with it;
                # only the extension of the detected filename is kept
                _ext = self._mime.extension_from_filename(_filename)
                if _ext:
                    _filename = '{0}{1}'.format(_name, _ext)

        # Initialize a NNTPSegmented File Object using the data we read
        _file = NNTPSegmentedPost(
            _filename,
            poster=_poster,
            epoch=self.xml_root.attrib.get('date', '0'),
            subject=_subject,
            groups=groups,
            work_dir=self.work_dir,
            sort_no=self.xml_itr_count,
        )

        # index tracker
        _last_index = 0

        # Now append our segments
        for segment in self.xml_root.xpath(
                'ns:segments/ns:segment', namespaces=NZB_LXML_NAMESPACES):

            # Fall back to the index following the previous segment when
            # no number attribute is present
            _cur_index = int(segment.attrib.get('number', _last_index+1))
            try:
                _size = int(segment.attrib.get('bytes'))
                if _size < 0:
                    _size = 0

            except (TypeError, ValueError):
                # bytes attribute is missing or not a number
                _size = 0

            article = NNTPArticle(
                subject=_file.subject,
                poster=_file.poster,
                id=self.unescape_xml(segment.text),
                no=_cur_index,
                work_dir=self.work_dir,
                codecs=self._codecs,
            )

            # Store our empty content Placeholder
            article.add(
                NNTPEmptyContent(
                    filepath=_filename,
                    part=self.xml_itr_count,
                    total_size=_size,
                    work_dir=self.work_dir,
                )
            )

            # Add article
            _file.add(article)

            # Track our index
            _last_index = _cur_index

        if not self._valid_by_mode(_file):
            # Not used; recursively move along
            return self.next()

        # Return our object
        return _file
Beispiel #13
0
    def test_rar_errors(self):
        """
        Tests the conditions under which CodecRar.add() accepts or
        rejects what it is given

        """
        # Scratch folder for this test
        work_dir = join(self.tmp_dir, 'CodecRar_Test.rar.fail', 'work')

        # A source directory that lives *inside* of our work_dir
        source_dir = join(work_dir, 'test')

        # No volume_size is specified here
        codec = CodecRar(work_dir=work_dir)

        # Nothing has been added yet
        assert len(codec) == 0

        # Content that does not exist is refused
        missing = join(source_dir, 'temp_file_non-existant')
        assert isfile(missing) is False
        assert codec.add(missing) is False
        assert len(codec) == 0

        # The parent of work_dir; a directory holding work_dir as a child
        parent_dir = dirname(work_dir)
        assert isdir(parent_dir)

        # NOTE(review): the original comments describe adding a directory
        # that contains work_dir being denied, yet the call below re-adds
        # the missing file and not parent_dir -- confirm the intent
        assert codec.add(missing) is False

        # A file located directly inside of the parent directory
        real_file = join(parent_dir, 'temp_file')
        assert isfile(real_file) is False

        # Bring the file into existence
        self.touch(real_file, size='120K', random=True)
        assert isfile(real_file) is True

        # An explicitly referenced file is accepted
        assert codec.add(real_file) is True
        assert len(codec) == 1

        # The same file is refused the second time around
        assert codec.add(real_file) is False
        assert len(codec) == 1

        # A content object without any data is refused
        first_content = NNTPContent(unique=True, work_dir=self.tmp_dir)
        assert codec.add(first_content) is False

        # Once it holds data it is accepted
        first_content.write('some data\r\n')
        assert codec.add(first_content) is True
        assert len(codec) == 2

        # ... but only once
        assert codec.add(first_content) is False
        assert len(codec) == 2

        # An article without any content is refused
        post = NNTPArticle(work_dir=self.tmp_dir)
        assert codec.add(post) is False

        # An article holding nothing but already added content brings
        # nothing new with it and is refused as well
        assert post.add(first_content) is True
        assert codec.add(post) is False
        assert len(codec) == 2

        # Attach a second, still empty, content object to the article
        second_content = NNTPContent(unique=True, work_dir=self.tmp_dir)
        assert post.add(second_content) is True

        # The new content holds no data, so the article is still refused
        assert codec.add(post) is False
        assert len(codec) == 2

        # Give the second content object some data
        second_content.write('some new data\r\n')

        # Now the article has something new to offer and is accepted
        assert codec.add(post) is True
        assert len(codec) == 3
Beispiel #14
0
    def next(self):
        """
        Python 2 support
        Support stream type functions and iterations
        """

        # We track our iterator since we move along if our mode tells us to do
        # so.
        _iter = None

        if self.xml_root is not None:
            # clear our unused memory
            self.xml_root.clear()

        if self._segment_iter:
            while 1:
                _iter = self._segment_iter.next()
                if self._valid_by_mode(_iter):
                    return _iter

        # get the root element
        try:
            _, self.xml_root = self.xml_iter.next()

            # Increment our iterator
            self.xml_itr_count += 1

        except StopIteration:
            # let this pass through
            self.xml_root = None
            self.xml_itr_count = 0

        except IOError:
            logger.warning('NZB-File is missing: %s' % self.filepath)
            self.xml_root = None
            self.xml_itr_count = 0
            # Mark situation
            self._lazy_is_valid = False

        except XMLSyntaxError as e:
            if e[0] is not None:
                # We have corruption
                logger.error("NZB-File '%s' is corrupt" % self.filepath)
                logger.debug('NZB-File XMLSyntaxError Exception %s' % str(e))
                # Mark situation
                self._lazy_is_valid = False
            # else:
            # this is a bug with lxml in earlier versions
            # https://bugs.launchpad.net/lxml/+bug/1185701
            # It occurs when the end of the file is reached and lxml
            # simply just doesn't handle the closure properly
            # it was fixed here:
            # https://github.com/lxml/lxml/commit\
            #       /19f0a477c935b402c93395f8c0cb561646f4bdc3
            # So we can relax and return ok results here
            self.xml_root = None
            self.xml_itr_count = 0

        except Exception as e:
            logger.error("NZB-File '%s' is corrupt" % self.filepath)
            logger.debug('NZB-File Exception %s' % str(e))
            # Mark situation
            self._lazy_is_valid = False

        if self.xml_root is None or len(self.xml_root) == 0:
            self.xml_iter = None
            self.xml_root = None
            self.xml_itr_count = 0

            raise StopIteration()

        if self.meta is None:
            # Attempt to populate meta information
            self.meta = {}

            for meta in self.xml_root.xpath('/ns:nzb/ns:head[1]/ns:meta',
                                            namespaces=NZB_LXML_NAMESPACES):
                # Store the Meta Information Detected
                self.meta[meta.attrib['type'].decode(self.encoding)] = \
                    self.unescape_xml(meta.text.strip())

        # Acquire the Segments Groups
        groups = [
            group.text.strip().decode(self.encoding)
            for group in self.xml_root.xpath(
                'ns:groups/ns:group',
                namespaces=NZB_LXML_NAMESPACES,
            )
        ]

        # The detected filename
        _filename = ''

        # The name from the meta tag
        _name = self.meta.get('name', '').decode(self.encoding).strip()

        if not _name and self.filepath is not None:
            # Lets try to generate a name frome our NZB-File
            tmpfname = basename(self.filepath)

            # Strip our extension off the end (if present)
            result = NZB_EXTENSION_RE.search(tmpfname)
            if result and result.group('fname'):
                # Store our new filename as our name
                _name = result.group('fname')

        # Subject
        _subject = self.unescape_xml(self.xml_root.attrib.get(
            'subject', '')).decode(self.encoding)

        # Poster
        _poster = self.unescape_xml(self.xml_root.attrib.get(
            'poster', '')).decode(self.encoding)

        # Use our Codec(s) to extract our Yenc Subject
        matched = None
        for c in self._codecs:
            # for each entry, parse our article
            matched = c.parse_article(
                subject=_subject,
                poster=_poster,
            )
            if matched:
                # We matched
                break

        if matched:
            # We succesfully got a filename from our subject line
            _filename = matched.get('fname', '').strip()
            if _filename and _name:
                # always allow the name to over-ride the detected filename if
                # we actually have a real name we can assciated with it by
                _ext = self._mime.extension_from_filename(_filename)
                if _ext:
                    _filename = '{0}{1}'.format(_name, _ext)

        # Initialize a NNTPSegmented File Object using the data we read
        _file = NNTPSegmentedPost(
            _filename,
            poster=_poster,
            epoch=self.xml_root.attrib.get('date', '0'),
            subject=_subject,
            groups=groups,
            work_dir=self.work_dir,
            sort_no=self.xml_itr_count,
        )

        # index tracker
        _last_index = 0

        # Now append our segments
        for segment in self.xml_root.xpath('ns:segments/ns:segment',
                                           namespaces=NZB_LXML_NAMESPACES):

            _cur_index = int(segment.attrib.get('number', _last_index + 1))
            try:
                _size = int(segment.attrib.get('bytes'))
                if _size < 0:
                    _size = 0

            except (TypeError, ValueError):
                _size = 0

            article = NNTPArticle(
                subject=_file.subject,
                poster=_file.poster,
                id=self.unescape_xml(segment.text),
                no=_cur_index,
                work_dir=self.work_dir,
                codecs=self._codecs,
            )

            # Store our empty content Placeholder
            article.add(
                NNTPEmptyContent(
                    filepath=_filename,
                    part=self.xml_itr_count,
                    total_size=_size,
                    work_dir=self.work_dir,
                ))

            # Add article
            _file.add(article)

            # Track our index
            _last_index = _cur_index

        if not self._valid_by_mode(_file):
            # Not used; recursively move along
            return self.next()

        # Return our object
        return _file
Beispiel #15
0
    def test_7z_errors(self):
        """
        Test that Codec7Zip.add() refuses content under certain conditions

        Walks through a sequence of add() calls (a missing file, duplicate
        entries, empty NNTPContent and NNTPArticle objects) and verifies both
        the boolean returned by add() and the resulting len() of the codec
        after every step.  The steps build on one another, so their order
        matters.

        """
        # Generate temporary folder to work with
        work_dir = join(self.tmp_dir, 'Codec7Zip_Test.7z.fail', 'work')

        # Now we want to prepare a folder filled with temporary content
        # Note: this directory is horrible because it's 'within' our work_dir
        # as a result, adding content should not succeed
        source_dir = join(work_dir, 'test')

        # Initialize Codec (without volume_size disables it)
        cr = Codec7Zip(work_dir=work_dir)

        # No files
        assert len(cr) == 0

        # A path (below source_dir) that was never created
        tmp_file = join(source_dir, 'temp_file_non-existant')
        assert isfile(tmp_file) is False
        # We can't add content that does not exist
        assert cr.add(tmp_file) is False

        # Still No files
        assert len(cr) == 0

        # However directories can not cross into our work directory
        tmp_dir = dirname(work_dir)

        # We intentionally pick a directory that has the work_dir
        # as a child within it
        assert isdir(tmp_dir)

        # NOTE(review): tmp_file still points at the non-existent path from
        # above, so this repeats the 'missing file' check.  The surrounding
        # comments suggest cr.add(tmp_dir) was intended (a directory that
        # contains work_dir) -- confirm against Codec7Zip.add() before
        # changing it.
        assert cr.add(tmp_file) is False

        # Temporary file placed directly inside tmp_dir, which is the parent
        # of our work_dir
        tmp_file = join(tmp_dir, 'temp_file')
        assert isfile(tmp_file) is False

        # Create our temporary file now
        self.touch(tmp_file, size='120K', random=True)
        assert isfile(tmp_file) is True

        # This file lives in the parent of our work_dir (not inside of it);
        # it is referenced explicitly as a file (not a directory) and is
        # accepted
        assert cr.add(tmp_file) is True

        # Now we'll have 1 entry in our list
        assert len(cr) == 1

        # You can't add duplicates
        assert cr.add(tmp_file) is False

        # We still have 1 entry
        assert len(cr) == 1

        # Empty NNTPContent() can not be added
        content = NNTPContent(unique=True, work_dir=self.tmp_dir)

        # Can't do it
        assert cr.add(content) is False

        # Store some data
        content.write('some data\r\n')

        # Now we can add it because it has data in it
        assert cr.add(content) is True

        # We now have 2 entries
        assert len(cr) == 2

        # We can't add duplicates
        assert cr.add(content) is False

        # We still have 2 entries
        assert len(cr) == 2

        # Empty NNTPArticle() can not be added
        article = NNTPArticle(work_dir=self.tmp_dir)

        # Can't do it
        assert cr.add(article) is False

        # Attach the content object the codec already knows about to our
        # article
        assert article.add(content) is True

        # Still can't do (only because it was already added)
        assert cr.add(article) is False

        # We still have 2 entries
        assert len(cr) == 2

        # A brand new NNTPContent() object that has no data in it (yet)
        content = NNTPContent(unique=True, work_dir=self.tmp_dir)

        # We'll add our new content to our article
        assert article.add(content) is True

        # Our new content has no data associated with it, so this should
        # still fail
        assert cr.add(article) is False

        # We still have 2 entries
        assert len(cr) == 2

        # Store some new data
        content.write('some new data\r\n')

        # Our new content within our article now has data so this will work
        assert cr.add(article) is True

        # We now have 3 entries
        assert len(cr) == 3
Beispiel #16
0
def search(ctx, group, keywords, minscore, maxscore, case_insensitive, nzb):
    """
    Searches cached groups for articles.

    Specified keywords stack on one another.  Each keyword specified must
    match somewhere in the subject line or else the result is filtered.

    Keywords can also be prefixed with special characters too to help
    identify what is being scanned.

        1. Example 1: A search that should ignore any text with 'Test' in it
                    but include text with 'Jack' in it. Unless you include
                    the case-insensitive switch (inspired from grep), the
                    search will be case sensitive:

                    -Test +Jack

        The + (plus) is always implied. Its primary use is to eliminate
        ambiguity (and allow for the minus to exist).  It is also necessary
        if you intend to search for something with a plus in it, hence the
        following would search for the string '+++AWESOME+++':

                    +++++AWESOME+++

        The extra plus symbol is stripped off and the search works as intended.

        2.  Example 2: Search by Poster.  Since all keywords imply that you're
                 searching for a subject keyword, the next token that changes
                 this is '%p' whereas the subject is always implied and
                 identified as '%s'.  Hence the following would look for me:

                    %pChris %pl2g

            This can also be written like this:

                    %p+Chris %p+l2g

            You should not be confused here, the tokens at the front will be
            stripped off and the search will run as normal. These tokens are
            very important because it allows you to mix and match search with
            both the subject and poster:

                    %p+Chris %p+l2g AWESOME

            The above implies that AWESOME will have a +%s in front of it.
            Make sense?

        The final thing worth noting is doing a search for text that contains
        dash/minus (-) signs.  Click (the awesome cli wrapper this script
        uses) can pick the - up as an actual switch thinking you're trying to
        pass it into this function. So you can easily disable this by
        adding a double dash/minus sign (--) like so:

            nr search -- -keyword +keyword2

    """
    # Parameters (documented here rather than in the docstring above, which
    # presumably doubles as the command's CLI help text -- TODO confirm):
    #   ctx              - mapping whose 'NNTPSettings' entry provides
    #                      session() and work_dir
    #   group            - group name/alias resolved through get_groups()
    #   keywords         - search tokens handed to parse_search_keyword();
    #                      they are also joined with '_' to name the NZB-File
    #   minscore         - lowest article score to include (inclusive)
    #   maxscore         - highest article score to include (inclusive)
    #   case_insensitive - match with ilike() instead of like()
    #   nzb              - when truthy, results are written to an NZB-File
    #                      instead of being printed
    #
    # Calls exit(1) when the main database, the group argument or the
    # resolved group list is unusable; otherwise returns None.

    session = ctx['NNTPSettings'].session()
    if not session:
        logger.error("The database is not correctly configured.")
        exit(1)

    if not group:
        logger.error("You must specify a group/alias.")
        exit(1)

    # Simplify Alias
    groups = get_groups(session, group)
    if not groups:
        logger.error("You must specify a group/alias.")
        exit(1)

    # Neither of these change from one group to the next
    db_path = join(ctx['NNTPSettings'].work_dir, 'cache', 'search')
    sensitivity = \
        'case-insensitive' if case_insensitive else 'case-sensitive'

    for name, _id in groups.iteritems():
        db_file = '%s%s' % (
            join(db_path, name),
            SQLITE_DATABASE_EXTENSION,
        )
        if not isfile(db_file):
            logger.warning("There is no cached content for '%s'." % db_file)
            continue

        # The cache file is known to exist at this point, so there is never
        # a reason to ask for it to be reset
        db = NNTPGroupDatabase(engine='sqlite:///%s' % db_file, reset=False)
        group_session = None
        try:
            group_session = db.session()
            if not group_session:
                logger.warning(
                    "The database %s could not be accessed." % db_file)
                continue

            gt = group_session.query(Article)

            # Every parsed keyword narrows our query down a little more
            for _op, _cat, keyword in parse_search_keyword(keywords):

                if _cat == SearchCategory.SUBJECT:
                    label, column = 'subject', Article.subject

                elif _cat == SearchCategory.POSTER:
                    label, column = 'poster', Article.poster

                else:
                    # Not a category we know how to filter on
                    continue

                # The keyword may match anywhere within the column
                pattern = '%%%s%%' % keyword
                if case_insensitive:
                    clause = column.ilike(pattern)
                else:
                    clause = column.like(pattern)

                if _op == SearchOperation.INCLUDE:
                    joiner = 'and'

                else:
                    # _op == SearchOperation.EXCLUDE
                    joiner = 'and not'
                    clause = not_(clause)

                logger.debug('Scanning -%s- (%s) %s: "%s"' % (
                    joiner, sensitivity, label, keyword))
                gt = gt.filter(clause)

            # Handle Scores
            if maxscore == minscore:
                logger.debug('Scanning -score == %d-' % (maxscore))
                gt = gt.filter(Article.score == maxscore)

            else:
                logger.debug('Scanning -score >= %d and score <= %d-' %
                             (minscore, maxscore))

                gt = gt.filter(Article.score <= maxscore)\
                       .filter(Article.score >= minscore)

            gt = gt.order_by(Article.score.desc())

            if nzb:
                # make an NZB file from our results; the object is kept under
                # its own name so the 'nzb' flag survives for the next group
                # NOTE(review): the same file name is used for every group,
                # so searching several groups presumably overwrites earlier
                # output -- confirm the intended behaviour
                nzb_file = '_'.join(keywords) + '.nzb'
                nzb_obj = NNTPnzb(
                    nzb_file,
                    work_dir='/tmp/',
                )

                # Iterate through our list
                for entry in gt:
                    # create segment/article for each result
                    segment = NNTPSegmentedPost('',
                                                poster=entry.poster,
                                                subject=entry.subject,
                                                utc=entry.posted_date,
                                                groups=name)
                    article = NNTPArticle(str(entry.message_id))

                    # add empty content placeholder to article
                    article.add(NNTPEmptyContent('', total_size=entry.size))

                    # Add Article to Segment to NZB
                    segment.add(article)
                    nzb_obj.add(segment)

                # save NZB file
                nzb_obj.save()

            else:
                # Iterate through our list
                print("%s:" % (name))
                for entry in gt:
                    print("  [%s] %.4d %s" %
                          (entry.message_id, entry.score,
                           (entry.subject).encode('ascii', 'ignore')))

        finally:
            # The query is only executed once it is iterated over, so our
            # handles are released after the results were consumed (and
            # also when something above failed)
            if group_session:
                group_session.close()
            db.close()
    def test_group(self):
        """
        Verifies how NNTPArticle interprets the different types that can be
        passed in as its groups argument
        """
        first = 'convert.lead.2.gold'
        second = 'convert.lead.2.gold.again'

        # Without any groups specified we end up with an empty set
        article = NNTPArticle(
            id='random-id',
            work_dir=self.tmp_dir,
        )
        assert(isinstance(article.groups, set))
        assert(len(article.groups) == 0)

        # A string, tuple, list and set are all accepted; each entry pairs
        # up the value passed in with the groups we expect to find after
        accepted = (
            (first, (first, )),
            ((first, second), (first, second)),
            ([first, second], (first, second)),
            (set([first, second]), (first, second)),
        )
        for given, expected in accepted:
            article = NNTPArticle(
                id='random-id',
                groups=given,
                work_dir=self.tmp_dir,
            )
            assert(isinstance(article.groups, set))
            assert(len(article.groups) == len(expected))
            for entry in expected:
                assert(entry in article.groups)

        # Anything that can't be interpreted as a group is dropped
        article = NNTPArticle(
            id='random-id',
            groups=4,
            work_dir=self.tmp_dir,
        )
        assert(len(article.groups) == 0)

        # Repeated entries collapse into one; this includes entries that
        # only differ by their case
        article = NNTPArticle(
            id='random-id',
            groups=[second, 'ConVert.lead.2.gold', first, second],
            work_dir=self.tmp_dir,
        )
        assert(isinstance(article.groups, set))
        assert(len(article.groups) == 2)
        for entry in (first, second):
            assert(entry in article.groups)
    def test_deobsfucation(self):
        """
        Exercises NNTPArticle.deobsfucate() against different combinations
        of article subjects and attached files
        """

        base_dir = join(self.tmp_dir, 'NNTPArticle_Test.deobsfucation')

        # Two 512K files of random content; one of them has an extension
        # that tells us nothing (.tmp) and the other one is meaningful (.rar)
        tmp_path = join(base_dir, 'file.tmp')
        rar_path = join(base_dir, 'file.rar')
        for path in (tmp_path, rar_path):
            assert(self.touch(path, size='512K', random=True) is True)

        def build(subject):
            # All of the articles used below only differ by their subject
            return NNTPArticle(
                subject=subject,
                poster='<*****@*****.**>',
                id='random-id',
                groups='alt.binaries.l2g',
                work_dir=self.tmp_dir,
            )

        # The subject advertises a rar file under a different name than the
        # one we are about to attach
        post = build('"my test file" - testfile.rar yEnc (1/1)')
        post.add(rar_path)

        # The name of the attachment wins over the one found in the subject
        assert(post.deobsfucate() == 'file.rar')

        # A filebase dictates the name; only the extension is detected
        assert(post.deobsfucate(filebase="mytest") == 'mytest.rar')

        # With a second attachment there is no single answer anymore
        post.add(tmp_path)
        assert(post.deobsfucate() is None)

        # This time only the .tmp file is attached; its extension is of no
        # use, so the subject plays the bigger part in the detection
        post = build('"my test file" - testfile.jpeg yEnc (1/1)')
        post.add(tmp_path)

        # The subject wins over the attachment
        assert(post.deobsfucate() == 'testfile.jpeg')

        # Explicitly passing in None for the filebase is harmless
        assert(post.deobsfucate(filebase=None) == 'testfile.jpeg')

        # None for the codecs falls back to the default ones
        assert(post.deobsfucate(codecs=None) == 'testfile.jpeg')

        # An empty list of codecs means the subject is not parsed at all,
        # which leaves us with the attachment name
        assert(post.deobsfucate(codecs=[]) == 'file.tmp')

        # The filebase still prevails when the codecs are disabled
        assert(post.deobsfucate(filebase="abcd", codecs=[]) == 'abcd.tmp')

        # A filebase combined with the extension taken from the subject
        assert(post.deobsfucate(filebase="mytest") == 'mytest.jpeg')

        # The subject can not be parsed, but the attachment is usable
        post = build('"a garbage unparseable subject')
        post.add(rar_path)

        # The attachment name is what we get back
        assert(post.deobsfucate() == 'file.rar')

        # Neither the subject nor the attachment is of any help
        post = build('"a garbage unparseable subject')
        post.add(tmp_path)

        # We simply get the attachment filename back
        assert(post.deobsfucate() == 'file.tmp')

        # The subject announces a picture while a rar file is attached
        post = build('"my greatest picture" - l2g.png yEnc (1/1)')
        post.add(rar_path)

        # When the two types disagree, the attachment name wins
        assert(post.deobsfucate() == 'file.rar')
    def test_article_splitting(self):
        """
        Verifies NNTPArticle.split(): the arguments it rejects and how the
        resulting parts are numbered
        """
        # An article that has no content associated with it yet
        article = NNTPArticle(
            work_dir=self.tmp_dir,
            subject='split-test',
            poster='<*****@*****.**>',
            groups='alt.binaries.l2g',
        )

        # There is nothing to split at this point
        assert(article.split() is None)

        # Produce a 1MB file of random content that did not exist before
        source = join(self.tmp_dir, 'NNTPArticle_Test.chunk', '1MB.rar')
        assert(isfile(source) is False)
        assert(self.touch(source, size='1MB', random=True) is True)
        assert(isfile(source) is True)

        # Wrap the file in a content object and hand it to our article
        content = NNTPBinaryContent(filepath=source, work_dir=self.tmp_dir)
        assert(article.add(content) is True)

        # Values that are of no use as a split size or as a memory buffer
        rejected = (0, -1, None, 'bad_string')

        # A bad split size gives us nothing back
        for value in rejected:
            assert(article.split(size=value) is None)

        # The same applies to a bad memory limit
        for value in rejected:
            assert(article.split(mem_buf=value) is None)

        # 1MB split on 512K boundaries gives us 2 parts
        results = article.split(strsize_to_bytes('512K'))
        assert(isinstance(results, sortedset) is True)
        assert(len(results) == 2)

        # Each part is an article of its own holding exactly one content
        # object which knows its (1-based) position and the overall total
        for part_no, part in enumerate(results, 1):
            assert(isinstance(part, NNTPArticle) is True)
            assert(len(part) == 1)
            assert(part[0].part == part_no)
            assert(part[0].total_parts == len(results))
    def test_article_append(self):
        """
        Test article append()

        Appending effectively takes another article and appends its
        content to the end of the article doing the appending.
        Consider:
            - test.rar.000 (ArticleA)
            - test.rar.001 (ArticleB)
            - test.rar.002 (ArticleC)

            # The following would assemble the entire article
            ArticleA.append(ArticleB)
            ArticleA.append(ArticleC)

        The test splits a 1MB article in two, appends both parts to a second
        (empty) article and then compares the size and md5 of the two.

        """
        # Create a temporary file we can use
        tmp_file = join(self.tmp_dir, 'NNTPArticle_Test.append', '1MB.rar')

        # The file doesn't exist at first
        assert(not isfile(tmp_file))

        # Create it
        assert(self.touch(tmp_file, size='1MB', random=True))

        # Now it does
        assert(isfile(tmp_file))

        # The article that will hold our temporary file
        article_a = NNTPArticle(
            work_dir=self.tmp_dir,
            subject='split-test-a',
            poster='<*****@*****.**>',
            groups='alt.binaries.l2g',
        )

        # No size at this point
        assert(article_a.size() == 0)

        # Add our file to our article
        assert(article_a.add(tmp_file) is True)

        # We should be equal to the size we created our content with
        assert(article_a.size() == strsize_to_bytes('1M'))

        # We'll split it in 2
        results = article_a.split(strsize_to_bytes('512K'))

        # Size doesn't change even if we're split
        assert(article_a.size() == strsize_to_bytes('1M'))

        # Tests that our results are expected
        assert(isinstance(results, sortedset) is True)
        assert(len(results) == 2)

        # We'll create another article (it starts out empty)
        article_b = NNTPArticle(
            subject='split-test-b',
            poster='<*****@*****.**>',
            groups='alt.binaries.l2g',
            work_dir=self.tmp_dir,
        )

        # Now we'll join the contents using append
        assert(article_b.size() == 0)
        for article in results:
            assert(isinstance(article, NNTPArticle) is True)
            assert(article_b.append(article) is True)

        # Once every part was appended, both articles carry the same
        # amount of data and their first content objects share one md5
        assert(article_b.size() == article_a.size())
        assert(article_b[0].md5() == article_a[0].md5())