def _split_helper(self, filename, split_size, has_header=False, dos_adjust=False):
    splitter = File_Splitter(filename, has_header)
    part_total_count = 0
    for (part_name, line_count) in splitter.splitfile(split_size):
        # Each part reported by splitfile() must contain exactly the number
        # of lines the splitter claims it wrote.
        part_count = LineCounter(part_name).line_count
        self.assertEqual(part_count, line_count)
        part_total_count += part_count
        os.unlink(part_name)
    lc = LineCounter(filename)
    if has_header:
        # With a header, the parts account for every line except the header.
        self.assertEqual(part_total_count, lc.line_count - 1)
    else:
        self.assertEqual(part_total_count, lc.line_count)

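# Illustrative call sites for the helper above (the data files are used
# elsewhere in this suite; the split sizes are made-up values):
#
#   self._split_helper(f("data/10k.txt"), split_size=2500)
#   self._split_helper(f("data/AandE_Data_2011-04-10.csv"), split_size=100, has_header=True)
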
def _test_file(self, count, doseol=False, filename="liner.txt", unlink=True):
    # Create a file with a known number of lines and confirm that LineCounter
    # reports the same count.
    path = make_line_file(count=count, doseol=doseol, filename=filename)
    self.assertEqual(count, LineCounter(path).line_count)
    if unlink:
        os.unlink(path)

def pwc(*argv):
    """Print a line count and byte count for each file, wc-style."""
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs="*", help="list of files")
    args = parser.parse_args(*argv)

    total_count = 0
    total_size = 0

    if args.filenames:
        print("lines\tbytes\tfilename")
        for filename in args.filenames:
            counter = LineCounter(filename)
            total_count += counter.line_count
            total_size += counter.file_size()
            print("%i\t%i\t%s" % (counter.line_count, counter.file_size(), filename))
        if len(args.filenames) > 1:
            print("%i\t%i\ttotal" % (total_count, total_size))

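# Sketch of how pwc() is meant to be driven (not itself a test): with no
# arguments argparse falls back to sys.argv, or an explicit argument list can
# be supplied, e.g.
#
#   pwc(["data/10k.txt", "data/120lines.txt"])
#
# which prints a tab-separated "lines  bytes  filename" row per file, plus a
# trailing "total" row when more than one filename is given.
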
def test_date_format(self):
    col = self._database["mot"]
    start_count = col.count_documents({})
    fp = FileProcessor(col, delimiter='|')
    fp.processOneFile(f("data/mot_time_format_test.txt"))
    lines = LineCounter(f("data/mot_time_format_test.txt")).line_count
    # Every line in the file should have produced a document.
    self.assertEqual(lines, col.count_documents({}) - start_count)
    self.assertTrue(col.find_one({"test_id": 1077}))

def test_mot_data(self):
    col = self._database["mot"]
    start_count = col.count_documents({})
    fp = FileProcessor(col, '|')
    fp.processOneFile(f("data/10k.txt"))
    lines = LineCounter(f("data/10k.txt")).line_count
    self.assertEqual(lines, col.count_documents({}) - start_count)
    self.assertTrue(col.find_one({"test_id": 114624}))

def test_A_and_E_data(self):
    col = self._database["AandE"]
    start_count = col.count_documents({})
    fp = FileProcessor(col, ',', onerror="ignore")
    fp.processOneFile(input_filename=f("data/AandE_Data_2011-04-10.csv"), hasheader=True)
    lines = LineCounter(f("data/AandE_Data_2011-04-10.csv")).line_count
    # LineCounter counts the header line, which is not inserted, hence the +1.
    self.assertEqual(lines, col.count_documents({}) - start_count + 1)
    self.assertTrue(col.find_one({"Code": "RA4"}))

def test_new_delimiter_and_timeformat_header(self):
    start_count = self._col.count_documents({})
    fc = FieldFile(f("data/mot.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f('data/mot_test_set_small.csv'), has_header=False, delimiter="|")
    self.assertIsInstance(reader.name, str)
    bw = FileWriter(self._col, reader=reader, parser=parser)
    total = bw.write()
    lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
    inserted_count = self._col.count_documents({}) - start_count
    self.assertEqual(inserted_count, total)
    self.assertEqual(inserted_count, lines)

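# The write path exercised above, as inferred from these tests rather than the
# library documentation: a FieldFile (.tff) describes the columns, a
# LineToDictParser turns each delimited line into a dict using that
# description, a FileReader iterates the input file, and FileWriter drains the
# reader through the parser into the target collection, with write() returning
# the number of documents written.
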
def test_Import_Command(self):
    self._audit = Audit(database=self._client["TEST_AUDIT"])
    batch_id = self._audit.start_batch({"test": "test_batch"})
    collection = self._database["import_test"]
    start_size = collection.count_documents({})

    size_10k = LineCounter(f("data/10k.txt")).line_count
    size_120 = LineCounter(f("data/120lines.txt")).line_count

    cmd = ImportCommand(audit=self._audit,
                        id=batch_id,
                        collection=collection,
                        field_filename=f("data/10k.tff"),
                        delimiter="|",
                        has_header=False,
                        onerror="warn",
                        limit=0)
    cmd.run(f("data/10k.txt"), f("data/120lines.txt"))

    new_size = collection.count_documents({})
    # Both input files should have been imported in full.
    self.assertEqual(size_10k + size_120, new_size - start_size)
    self._audit.end_batch(batch_id)

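# Audit flow as used above (an interpretation of these calls only, not of the
# wider Audit API): start_batch() returns a batch id, the id is passed to
# ImportCommand so the import can be tied to that batch, and end_batch()
# closes the batch once the import has run.
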
def test_property_prices(self):
    start_count = self._col.count_documents({})
    fp = FileProcessor(self._col, ',')
    try:
        fp.processOneFile(f("data/uk_property_prices.csv"))
    except pymongo.errors.BulkWriteError as e:
        # Dump the bulk write error details before failing the test.
        print(e)
        raise
    lines = LineCounter(f("data/uk_property_prices.csv")).line_count
    self.assertEqual(lines, self._col.count_documents({}) - start_count)
    self.assertTrue(self._col.find_one({"Postcode": "NG10 5NN"}))

def test_date(self):
    config = FieldFile(f("data/inventory_dates.tff"))
    parser = LineToDictParser(config, locator=False)  # screws up comparison later if locator is true
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    start_count = self._col.count_documents({})
    writer = FileWriter(self._col, reader=reader, parser=parser)
    docs_written = writer.write()
    line_count = LineCounter(f("data/inventory.csv")).line_count
    self.assertEqual(self._col.count_documents({}) - start_count, line_count - 1)  # header must be subtracted
    self.assertEqual(self._col.count_documents({}), docs_written)
    nuts_doc = self._col.find_one({"Last Order": dateutil.parser.parse("29-Feb-2016")})
    self.assertTrue(nuts_doc)

def test_gdelt_data(self):
    col = self._database["GDELT"]
    start_count = col.count_documents({})
    fp = FileProcessor(col, onerror="ignore", delimiter="tab")
    fp.processOneFile(input_filename=f("data/gdelt.tsv"),
                      hasheader=False,
                      field_filename=f("data/GDELT_columns.tff"))
    lines = LineCounter(f("data/gdelt.tsv")).line_count
    self.assertEqual(lines, col.count_documents({}) - start_count)
    self.assertTrue(
        col.find_one({
            "SOURCEURL": "https://www.standardspeaker.com/news/dream-factory-director-retiring-1.2467094"
        }))

def test_generate_fieldfile(self):
    fc = FieldFile.generate_field_file(f("data/inventory.csv"), ext="testff")
    self.assertEqual(fc.field_filename, f("data/inventory.testff"), fc.field_filename)
    self.assertTrue(os.path.isfile(f("data/inventory.testff")), f("data/inventory.testff"))

    parser = LineToDictParser(fc)
    reader = FileReader(f("data/inventory.csv"), has_header=True)
    start_count = self._col.count_documents({})
    writer = FileWriter(self._col, reader=reader, parser=parser)
    write_count = writer.write()

    line_count = LineCounter(f("data/inventory.csv")).line_count
    new_inserted_count = self._col.count_documents({}) - start_count
    self.assertEqual(new_inserted_count, write_count)
    self.assertEqual(new_inserted_count, line_count - 1)  # header must be subtracted
    os.unlink(f("data/inventory.testff"))

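# Note on generate_field_file(), inferred from the assertions above rather
# than from library docs: it appears to inspect the CSV and write a field file
# next to it, named after the input with the supplied extension, so
# "data/inventory.csv" with ext="testff" becomes "data/inventory.testff"; the
# returned FieldFile is then usable directly with LineToDictParser.
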
def _auto_split_helper(self, filename, lines, split_count, has_header=False, dos_adjust=False):
    splitter = File_Splitter(filename, has_header=has_header)
    part_total_count = 0
    total_line_count = splitter.line_count
    self.assertEqual(total_line_count, lines)
    for (part_name, line_count) in splitter.autosplit(split_count):
        part_count = LineCounter(part_name).line_count
        # Every part must be non-empty and its reported size must match reality.
        self.assertGreater(part_count, 0)
        self.assertEqual(part_count, line_count)
        part_total_count += part_count
        os.unlink(part_name)
    if has_header:
        # With a header, the parts account for every line except the header.
        self.assertEqual(part_total_count, lines - 1)
    else:
        self.assertEqual(part_total_count, lines)

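# The two helpers above cover the two splitting modes these tests rely on.
# Judging from the call sites, splitfile(split_size) cuts the file into parts
# of a fixed number of lines, while autosplit(split_count) aims for a given
# number of roughly equal parts; both yield (part_name, line_count) tuples and
# leave the part files on disk for the caller to clean up. This summary is
# inferred from usage here, not from File_Splitter's documentation.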