def test_fieldfile_nomatch(self):
    """A field file that does not match the CSV columns must make write() raise ValueError."""
    field_file = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
    line_parser = LineToDictParser(field_file)
    csv_reader = FileReader(f('data/inventory.csv'), has_header=True)
    writer = FileWriter(self._col, reader=csv_reader, parser=line_parser)
    with self.assertRaises(ValueError):
        writer.write()
def test_delimiter_header(self):
    """Importing a comma-delimited file with a header row inserts exactly 300 docs."""
    docs_before = self._col.count_documents({})
    field_file = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
    line_parser = LineToDictParser(field_file)
    csv_reader = FileReader(f('data/AandE_Data_2011-04-10.csv'), has_header=True)
    writer = FileWriter(self._col, reader=csv_reader, parser=line_parser)
    writer.write()
    docs_inserted = self._col.count_documents({}) - docs_before
    self.assertEqual(docs_inserted, 300)
def test_delimiter_no_header(self):
    """Importing a pipe-delimited, headerless file inserts exactly 10000 docs."""
    docs_before = self._col.count_documents({})
    field_file = FieldFile(f("data/10k.tff"))
    line_parser = LineToDictParser(field_file)
    pipe_reader = FileReader(f("data/10k.txt"), has_header=False, delimiter="|")
    writer = FileWriter(self._col, reader=pipe_reader, parser=line_parser)
    writer.write()
    docs_inserted = self._col.count_documents({}) - docs_before
    self.assertEqual(docs_inserted, 10000)
def test_new_delimiter_and_timeformat_header(self):
    """Import a pipe-delimited, headerless MOT file; every input line must land.

    Checks that write()'s reported total, the collection delta and the file's
    line count all agree. Replaces the fragile ``type(x) == str`` comparison
    with ``assertIsInstance`` — the idiomatic type assertion, which also gives
    a useful failure message and tolerates subclasses.
    """
    start_count = self._col.count_documents({})
    fc = FieldFile(f("data/mot.tff"))
    parser = LineToDictParser(fc)
    reader = FileReader(f('data/mot_test_set_small.csv'), has_header=False, delimiter="|")
    self.assertIsInstance(reader.name, str)
    bw = FileWriter(self._col, reader=reader, parser=parser)
    total = bw.write()
    lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
    inserted_count = self._col.count_documents({}) - start_count
    # No header row to subtract: line count, written count and delta must all match.
    self.assertEqual(inserted_count, total)
    self.assertEqual(inserted_count, lines)
def test_date(self):
    """Date fields in the field file must be parsed into real datetimes on import."""
    field_config = FieldFile(f("data/inventory_dates.tff"))
    # locator=False: a locator field would break the document comparison below.
    line_parser = LineToDictParser(field_config, locator=False)
    csv_reader = FileReader(f("data/inventory.csv"), has_header=True)
    docs_before = self._col.count_documents({})
    writer = FileWriter(self._col, reader=csv_reader, parser=line_parser)
    docs_written = writer.write()
    total_lines = LineCounter(f("data/inventory.csv")).line_count
    docs_inserted = self._col.count_documents({}) - docs_before
    # One line of the file is the header, hence the -1.
    self.assertEqual(docs_inserted, total_lines - 1)
    self.assertEqual(self._col.count_documents({}), docs_written)
    nuts_doc = self._col.find_one({"Last Order": dateutil.parser.parse("29-Feb-2016")})
    self.assertTrue(nuts_doc)
def test_generate_fieldfile(self):
    """generate_field_file() must emit a usable field file next to the CSV.

    The generated ``.testff`` is removed in a ``finally`` block so a failing
    assertion no longer leaks the file onto disk and breaks subsequent runs
    (the original only deleted it on the success path).
    """
    fc = FieldFile.generate_field_file(f("data/inventory.csv"), ext="testff")
    try:
        self.assertEqual(fc.field_filename, f("data/inventory.testff"), fc.field_filename)
        self.assertTrue(os.path.isfile(f("data/inventory.testff")), f("data/inventory.testff"))
        parser = LineToDictParser(fc)
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        start_count = self._col.count_documents({})
        writer = FileWriter(self._col, reader=reader, parser=parser)
        write_count = writer.write()
        line_count = LineCounter(f("data/inventory.csv")).line_count
        new_inserted_count = self._col.count_documents({}) - start_count
        self.assertEqual(new_inserted_count, write_count)
        # header must be subtracted
        self.assertEqual(new_inserted_count, line_count - 1)
    finally:
        # Always clean up the generated artefact, even on assertion failure.
        os.unlink(f("data/inventory.testff"))
def test_http_import(self):
    """Stream 1000 docs from a remote CSV over HTTP into the collection.

    Fixes two defects in the original:

    * ``writer.write(1000)``'s return value (number of docs written) was
      treated as a post-import *collection count* — that is only correct
      when the collection starts empty.  The collection is now re-counted
      explicitly after the write.
    * A missing network connection merely printed a warning and the test
      passed; it is now reported as a properly skipped test.
    """
    if not check_internet():
        self.skipTest("No internet: test_http_import() skipped")
    csv_parser = LineToDictParser(self._ff)
    reader = FileReader(
        "https://data.cityofnewyork.us/api/views/biws-g3hs/rows.csv?accessType=DOWNLOAD&bom=true&format=true&delimiter=%3B",
        has_header=True,
        delimiter=';')
    writer = FileWriter(self._collection, reader, csv_parser)
    before_doc_count = self._collection.count_documents({})
    writer.write(1000)
    after_doc_count = self._collection.count_documents({})
    self.assertEqual(after_doc_count - before_doc_count, 1000)
def test_local_import(self):
    """A write() with an explicit limit of 10 inserts exactly 10 documents."""
    taxi_reader = FileReader(f("data/2018_Yellow_Taxi_Trip_Data_1000.csv"),
                             has_header=True,
                             delimiter=";")
    docs_before = self._collection.count_documents({})
    writer = FileWriter(self._collection, reader=taxi_reader, parser=self._parser)
    writer.write(10)
    docs_after = self._collection.count_documents({})
    self.assertEqual(docs_after - docs_before, 10)
def pre_execute(self, arg):
    """Resolve the field file for *arg* and wire up the reader/parser/writer pipeline.

    Raises OSError when the resolved field file does not exist on disk.
    """
    super().pre_execute(arg)
    self._log.info("Using collection:'{}'".format(self._collection.full_name))
    if self._field_filename is None:
        # Fall back to the conventional field-file name derived from the input file.
        self._field_filename = FieldFile.make_default_tff_name(arg)
    self._log.info(f"Using field file:'{self._field_filename}'")
    if not os.path.isfile(self._field_filename):
        raise OSError(f"No such field file:'{self._field_filename}'")
    field_info = FieldFile(self._field_filename)
    self._fieldinfo = field_info
    self._reader = FileReader(arg,
                              limit=self._limit,
                              has_header=self._has_header,
                              delimiter=self._delimiter)
    self._parser = LineToDictParser(field_info,
                                    locator=self._locator,
                                    timestamp=self._timestamp,
                                    onerror=self._onerror)
    self._writer = FileWriter(self._collection, self._reader, self._parser)
class ImportCommand(Command):
    """Import a delimited text file into a MongoDB collection.

    Builds a ``FileReader`` -> ``LineToDictParser`` -> ``FileWriter`` pipeline
    driven by a field file (``.tff``) that describes the columns, then writes
    every parsed line into *collection*.

    Improvement over the original: a missing field file now raises
    ``FileNotFoundError`` (a subclass of ``OSError``, so existing callers that
    catch ``OSError`` are unaffected) for a more precise error type.
    """

    def __init__(self,
                 collection: pymongo.collection,
                 field_filename: str = None,
                 delimiter: str = ",",
                 has_header: bool = True,
                 onerror: ErrorResponse = ErrorResponse.Warn,
                 limit: int = 0,
                 locator=False,
                 timestamp: DocTimeStamp = DocTimeStamp.NO_TIMESTAMP,
                 audit: bool = None,
                 id: object = None):
        """Create an import command.

        :param collection: target pymongo collection.
        :param field_filename: path to the field file; defaults to the
            conventional ``.tff`` name derived from the input file.
        :param delimiter: field separator in the input file.
        :param has_header: whether the first input line is a header row.
        :param onerror: how parse errors are handled.
        :param limit: maximum number of lines to read (0 means no limit).
        :param locator: add a locator field to each document when True.
        :param timestamp: per-document/-batch timestamping policy.
        :param audit: optional audit sink passed to the base Command.
        :param id: optional command id passed to the base Command.
        """
        super().__init__(audit, id)
        self._log = logging.getLogger(__name__)
        self._collection = collection
        self._name = "import"
        self._field_filename = field_filename
        self._delimiter = delimiter
        self._has_header = has_header
        # Pipeline components are built lazily in pre_execute().
        self._parser = None
        self._reader = None
        self._writer = None
        self._onerror = onerror
        self._limit = limit
        self._locator = locator
        self._timestamp = timestamp
        self._total_written = 0

    def pre_execute(self, arg):
        """Resolve the field file for *arg* and assemble the import pipeline.

        :raises FileNotFoundError: if the resolved field file does not exist.
        """
        super().pre_execute(arg)
        self._log.info("Using collection:'{}'".format(self._collection.full_name))
        if self._field_filename is None:
            # Fall back to the conventional field-file name derived from arg.
            self._field_filename = FieldFile.make_default_tff_name(arg)
        self._log.info(f"Using field file:'{self._field_filename}'")
        if not os.path.isfile(self._field_filename):
            # FileNotFoundError is a subclass of OSError: backward compatible.
            raise FileNotFoundError(f"No such field file:'{self._field_filename}'")
        self._fieldinfo = FieldFile(self._field_filename)
        self._reader = FileReader(arg,
                                  limit=self._limit,
                                  has_header=self._has_header,
                                  delimiter=self._delimiter)
        self._parser = LineToDictParser(self._fieldinfo,
                                        locator=self._locator,
                                        timestamp=self._timestamp,
                                        onerror=self._onerror)
        self._writer = FileWriter(self._collection, self._reader, self._parser)

    def execute(self, arg):
        """Run the import and return the number of documents written."""
        self._total_written = self._writer.write()
        return self._total_written

    def total_written(self):
        """Return the document count from the most recent execute()."""
        return self._total_written

    @property
    def fieldinfo(self):
        """The FieldFile parsed during pre_execute()."""
        return self._fieldinfo

    def post_execute(self, arg):
        """Record the completed import in the audit trail and the log."""
        super().post_execute(arg)
        if self._audit:
            self._audit.add_command(self._id, self.name(), {"filename": arg})
        if self._log:
            self._log.info("imported file: '%s'", arg)