def test_getlazy_many_streams_from_txt_with_given_number_of_streams_without_adjust_for_current_cpu_521(
        self):
    """Check ``getlazy`` on the txt blogger set with a fixed stream count.

    With ``adjust_to_cpu=False`` the test expects ``getlazy`` to return
    exactly as many streams as requested, and the streams together to emit
    one end-of-file marker per file reported by
    ``_get_number_of_left_over_files``.  Afterwards every yielded row is
    compared with the raw content of the file it is paired with and its
    keys are verified.
    """
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(
        os.path.join(self.tempdir_blogger_corp,
                     self.txt_blogger_small_fake_set),
        "txt",
        regex_template="blogger",
        mode=self.mode,
        end_file_marker=end_file_marker,
        send_end_file_marker=True)
    number_of_found_files = reader._get_number_of_left_over_files()

    # Check for stream_number=3
    len(reader.getlazy(stream_number=3, adjust_to_cpu=False)).should.be.equal(3)
    len([
        rowdict
        for gen in reader.getlazy(stream_number=3, adjust_to_cpu=False)
        for rowdict in gen
        if end_file_marker == rowdict
    ]).should.be.equal(number_of_found_files)

    # Check for stream_number=2
    len(reader.getlazy(stream_number=2, adjust_to_cpu=False)).should.be.equal(2)
    len([
        rowdict
        for gen in reader.getlazy(stream_number=2, adjust_to_cpu=False)
        for rowdict in gen
        if end_file_marker == rowdict
    ]).should.be.equal(number_of_found_files)

    expected_keys = ('text', 'star_constellation', 'working_area',
                     'age', 'id', 'gender')
    seen_markers = 0
    # NOTE(review): pairing stream k with the k-th file of the reversed
    # file list only holds while every stream carries exactly one file --
    # confirm against the size of the fake set.
    streams = reader.getlazy(stream_number=3, adjust_to_cpu=False,
                             min_files_pro_stream=1)
    for gen, fname in zip(streams, reversed(reader.files_to_read_orig)):
        for row_dict in gen:
            if row_dict == end_file_marker:
                seen_markers += 1
                continue

            assert isinstance(row_dict, dict)

            # Close the handle deterministically instead of leaking one
            # open file per row.
            with codecs.open(fname, "r", encoding="utf-8") as handle:
                file_text = handle.read()
            assert row_dict["text"] == file_text

            assert len(row_dict) == len(expected_keys)
            for key in expected_keys:
                assert key in row_dict, "missing key %r in row" % (key,)

    assert number_of_found_files == seen_markers
def test_getlazy_many_streams_from_txt_without_given_number_of_streams_adjusted_for_current_cpu_520(
        self):
    """Check ``getlazy`` on the txt blogger set with CPU adjustment enabled.

    For several ``min_files_pro_stream`` values the number of returned
    streams is compared with the value ``get_number_of_streams_adjust_cpu``
    yields for the same request.  Finally all streams are consumed: every
    row must be a dict with the six expected keys and the number of
    end-of-file markers must equal the number of files found.
    """
    self.blogger_corpus()
    end_file_marker = -1
    reader = Reader(
        os.path.join(self.tempdir_blogger_corp,
                     self.txt_blogger_small_fake_set),
        "txt",
        regex_template="blogger",
        mode=self.mode,
        end_file_marker=end_file_marker,
        send_end_file_marker=True)
    number_of_found_files = reader._get_number_of_left_over_files()

    requested_streams = 4
    # 3 appears twice so the same request is re-checked after a different
    # one.  The expected value is always computed from the very arguments
    # handed to getlazy (min_files_pro_stream first, requested stream
    # number last), so the comparison cannot pass by coincidence.
    for min_files in (3, 5, 3, 2, 1):
        len(
            reader.getlazy(stream_number=requested_streams,
                           adjust_to_cpu=True,
                           min_files_pro_stream=min_files)
        ).should.be.equal(
            get_number_of_streams_adjust_cpu(
                min_files, number_of_found_files, requested_streams))

    expected_keys = ('text', 'star_constellation', 'working_area',
                     'age', 'id', 'gender')
    seen_markers = 0
    for gen in reader.getlazy(stream_number=1000, adjust_to_cpu=True,
                              min_files_pro_stream=1):
        for row_dict in gen:
            if row_dict == end_file_marker:
                seen_markers += 1
                continue

            assert isinstance(row_dict, dict)
            assert len(row_dict) == len(expected_keys)
            for key in expected_keys:
                assert key in row_dict, "missing key %r in row" % (key,)

    assert number_of_found_files == seen_markers
def test_getlazy_many_streams_from_json_also_getted_from_zips_519(self):
    """Check ``getlazy`` on the json blogger corpus with zip reading enabled.

    Test 1 verifies that the streams together emit one end-of-file marker
    per file found.  Test 2 verifies that the number of returned streams
    equals what ``get_number_of_streams_adjust_cpu`` yields for the same
    request.  Finally all streams are consumed and every row is checked
    to be a dict with the six expected keys.
    """
    self.blogger_corpus()

    # Test 1: check that the number of delivered files is correct
    end_file_marker = -1
    reader = Reader(
        os.path.join(self.tempdir_blogger_corp),
        "json",
        mode=self.mode,
        read_from_zip=True,
        end_file_marker=end_file_marker,
        send_end_file_marker=True)
    number_of_found_files = reader._get_number_of_left_over_files()

    assert number_of_found_files >= 3, (
        "expected at least 3 files, found %s" % (number_of_found_files,))

    # The main folder of the test corpus has to be zipped as well, so that
    # the zips contain the same number of files as the plain folder.
    assert reader.files_number_in_zips == len(reader.files_to_read_orig), (
        "files in zips (%s) != files on disk (%s)"
        % (reader.files_number_in_zips, len(reader.files_to_read_orig)))

    number_getted_files = len([
        row
        for gen in reader.getlazy(stream_number=4, adjust_to_cpu=True,
                                  min_files_pro_stream=5)
        for row in gen
        if row == end_file_marker
    ])
    assert number_of_found_files == number_getted_files, (
        "found %s files but streams delivered %s end-file markers"
        % (number_of_found_files, number_getted_files))

    # Test 2: check that the right number of streams is returned
    requested_streams = 4
    for min_files in (5, 3, 2, 1):
        len(
            reader.getlazy(stream_number=requested_streams,
                           adjust_to_cpu=True,
                           min_files_pro_stream=min_files)
        ).should.be.equal(
            get_number_of_streams_adjust_cpu(
                min_files, number_of_found_files, requested_streams))

    expected_keys = ('text', 'star_constellation', 'working_area',
                     'age', 'id', 'gender')
    seen_markers = 0
    for gen in reader.getlazy(stream_number=1000, adjust_to_cpu=True,
                              min_files_pro_stream=1):
        for row_dict in gen:
            if row_dict == end_file_marker:
                seen_markers += 1
                continue

            assert isinstance(row_dict, dict)
            assert len(row_dict) == len(expected_keys)
            for key in expected_keys:
                assert key in row_dict, "missing key %r in row" % (key,)

    assert number_of_found_files == seen_markers