def test_getlazy_many_streams_from_txt_with_given_number_of_streams_without_adjust_for_current_cpu_521(
            self):
        """Check ``getlazy`` on txt input with a fixed stream count.

        With ``adjust_to_cpu=False`` the number of returned streams must
        equal the requested ``stream_number``, and the streams together
        must emit one end-of-file marker per found file.  Afterwards every
        non-marker row of three streams is compared with the content of
        the file it is paired with and checked for the expected keys.
        """
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_small_fake_set),
                        "txt",
                        regex_template="blogger",
                        mode=self.mode,
                        end_file_marker=end_file_marker,
                        send_end_file_marker=True)
        number_of_found_files = reader._get_number_of_left_over_files()

        # The same two checks are run for stream_number=3 and stream_number=2.
        for stream_number in (3, 2):
            len(reader.getlazy(stream_number=stream_number,
                               adjust_to_cpu=False)).should.be.equal(
                                   stream_number)
            found_markers = [
                row
                for gen in reader.getlazy(stream_number=stream_number,
                                          adjust_to_cpu=False)
                for row in gen if end_file_marker == row
            ]
            len(found_markers).should.be.equal(number_of_found_files)

        expected_keys = ('text', 'star_constellation', 'working_area', 'age',
                         'id', 'gender')
        seen_markers = 0
        streams = reader.getlazy(stream_number=3,
                                 adjust_to_cpu=False,
                                 min_files_pro_stream=1)
        # NOTE(review): pairing stream i with the i-th file of the reversed
        # file list presumably relies on how getlazy distributes the files
        # over the streams -- confirm against the Reader implementation.
        for gen, fname in zip(streams, reversed(reader.files_to_read_orig)):
            for row_dict in gen:
                if row_dict == end_file_marker:
                    seen_markers += 1
                    continue
                # Close the file deterministically instead of leaking one
                # open handle per processed row.
                with codecs.open(fname, "r", encoding="utf-8") as source_file:
                    expected_text = source_file.read()
                # Type check first, so a non-dict row fails the assertion
                # instead of raising on the item access below.
                assert isinstance(row_dict, dict)
                assert row_dict["text"] == expected_text
                assert len(row_dict) == 6
                for key in expected_keys:
                    assert key in row_dict
        assert number_of_found_files == seen_markers
    def test_getlazy_many_streams_from_txt_without_given_number_of_streams_adjusted_for_current_cpu_520(
            self):
        """Check ``getlazy`` on txt input with ``adjust_to_cpu=True``.

        For several ``min_files_pro_stream`` values the number of returned
        streams must equal the result of
        ``get_number_of_streams_adjust_cpu`` called with the same
        parameters.  Afterwards all streams are consumed: every non-marker
        row must be a dict with the six expected keys, and the number of
        end-of-file markers must equal the number of found files.
        """
        self.blogger_corpus()
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp,
                                     self.txt_blogger_small_fake_set),
                        "txt",
                        regex_template="blogger",
                        mode=self.mode,
                        end_file_marker=end_file_marker,
                        send_end_file_marker=True)
        number_of_found_files = reader._get_number_of_left_over_files()

        requested_streams = 4
        # The expectation is derived from exactly the parameters handed to
        # getlazy, so the two can no longer drift apart (previously the
        # check for min_files_pro_stream=5 computed its expectation from
        # the arguments (3, ..., 5)).
        # NOTE(review): argument order (min_files_pro_stream, file count,
        # stream_number) is taken from the other calls of the helper in
        # this file -- confirm against its definition.
        # The value 3 is requested twice, before and after the value 5.
        for min_files in (3, 5, 3, 2, 1):
            streams = reader.getlazy(stream_number=requested_streams,
                                     adjust_to_cpu=True,
                                     min_files_pro_stream=min_files)
            len(streams).should.be.equal(
                get_number_of_streams_adjust_cpu(min_files,
                                                 number_of_found_files,
                                                 requested_streams))

        expected_keys = ('text', 'star_constellation', 'working_area', 'age',
                         'id', 'gender')
        seen_markers = 0
        for gen in reader.getlazy(stream_number=1000,
                                  adjust_to_cpu=True,
                                  min_files_pro_stream=1):
            for row_dict in gen:
                if row_dict == end_file_marker:
                    seen_markers += 1
                    continue
                assert isinstance(row_dict, dict)
                assert len(row_dict) == 6
                for key in expected_keys:
                    assert key in row_dict
        assert number_of_found_files == seen_markers
    def test_getlazy_many_streams_from_json_also_getted_from_zips_519(self):
        """Check ``getlazy`` on json input with ``read_from_zip=True``.

        Test 1 verifies that the streams emit one end-of-file marker per
        found file.  Test 2 verifies that the number of returned streams
        equals the result of ``get_number_of_streams_adjust_cpu`` for
        several ``min_files_pro_stream`` values.  Finally all streams are
        consumed: every non-marker row must be a dict with the six
        expected keys, and the marker count must equal the number of found
        files.
        """
        self.blogger_corpus()

        # Test 1: check that the number of delivered files is correct.
        end_file_marker = -1
        reader = Reader(os.path.join(self.tempdir_blogger_corp),
                        "json",
                        mode=self.mode,
                        read_from_zip=True,
                        end_file_marker=end_file_marker,
                        send_end_file_marker=True)
        number_of_found_files = reader._get_number_of_left_over_files()
        assert number_of_found_files >= 3, (
            "expected at least 3 files, found %s" % number_of_found_files)

        # For this check it is important that the main folder of the test
        # cases is also zipped, so that both contain the same number of
        # files.
        assert reader.files_number_in_zips == len(
            reader.files_to_read_orig), (
                "files in zips (%s) != files to read (%s)" %
                (reader.files_number_in_zips, len(reader.files_to_read_orig)))

        requested_streams = 4
        number_getted_files = len([
            row for gen in reader.getlazy(stream_number=requested_streams,
                                          adjust_to_cpu=True,
                                          min_files_pro_stream=5)
            for row in gen if row == end_file_marker
        ])
        assert number_of_found_files == number_getted_files, (
            "found %s files but got %s end-of-file markers" %
            (number_of_found_files, number_getted_files))

        # Test 2: check that the right number of streams is returned.
        for min_files in (5, 3, 2, 1):
            streams = reader.getlazy(stream_number=requested_streams,
                                     adjust_to_cpu=True,
                                     min_files_pro_stream=min_files)
            len(streams).should.be.equal(
                get_number_of_streams_adjust_cpu(min_files,
                                                 number_of_found_files,
                                                 requested_streams))

        expected_keys = ('text', 'star_constellation', 'working_area', 'age',
                         'id', 'gender')
        seen_markers = 0
        for gen in reader.getlazy(stream_number=1000,
                                  adjust_to_cpu=True,
                                  min_files_pro_stream=1):
            for row_dict in gen:
                if row_dict == end_file_marker:
                    seen_markers += 1
                    continue
                assert isinstance(row_dict, dict)
                assert len(row_dict) == 6
                for key in expected_keys:
                    assert key in row_dict
        assert number_of_found_files == seen_markers