Example 1
    def test_nyc_2016_genfieldfile(self):

        fc = FieldFile.generate_field_file(f('data/2018_Yellow_Taxi_Trip_Data_1000.csv'),
                                           delimiter=";")
        fc_new = FieldFile(fc.field_filename)

        self.assertEqual(fc.fields(), fc_new.fields())

        os.unlink(fc.field_filename)
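
For reference, the same generate-and-reload round trip outside a unit test looks like the sketch below. This is a minimal sketch, assuming the FieldFile API used throughout these examples; the pymongoimport.fieldfile import path and the data/inventory.csv input file are assumptions for illustration only.

    import os
    from pymongoimport.fieldfile import FieldFile  # import path is an assumption

    # Generate a .tff field file by inspecting the CSV, then reload it and
    # list the inferred field names and types.
    ff = FieldFile.generate_field_file("data/inventory.csv", delimiter=",")
    reloaded = FieldFile(ff.field_filename)
    for name in reloaded.fields():
        print(name, reloaded.type_value(name))

    os.unlink(ff.field_filename)  # clean up the generated field file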
Example 2
 def test_fieldfile_nomatch(self):
     fc = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
     parser = LineToDictParser(fc)
     reader = FileReader(f('data/inventory.csv'), has_header=True)
     bw = FileWriter(self._col, reader=reader, parser=parser)
     with self.assertRaises(ValueError):
         bw.write()
Example 3
 def test_delimiter_header(self):
     start_count = self._col.count_documents({})
     fc = FieldFile(f("data/AandE_Data_2011-04-10.tff"))
     parser = LineToDictParser(fc)
     reader = FileReader(f('data/AandE_Data_2011-04-10.csv'), has_header=True)
     bw = FileWriter(self._col, reader=reader, parser=parser)
     bw.write()
     self.assertEqual(self._col.count_documents({}) - start_count, 300)
Example 4
 def test_delimiter_no_header(self):
     start_count = self._col.count_documents({})
     fc = FieldFile(f("data/10k.tff"))
     parser = LineToDictParser(fc)
     reader = FileReader(f("data/10k.txt"), has_header=False, delimiter="|")
     bw = FileWriter(self._col, reader=reader, parser=parser)
     bw.write()
     self.assertEqual(self._col.count_documents({}) - start_count, 10000)
Example 5
    def test_FieldConfig(self):
        fc = FieldFile(f("data/test_fieldconfig.tff"))
        self.assertEqual(len(fc.fields()), 4)

        self.assertEqual(fc.fields()[0], "Test 1")
        self.assertEqual(fc.fields()[3], "Test 4")

        fc: FieldFile = FieldFile(f("data/uk_property_prices.tff"))
        self.assertEqual(len(fc.fields()), 16)

        self.assertEqual(fc.fields()[0], "txn")
        self.assertEqual(fc.fields()[2], "Date of Transfer")
        self.assertEqual(fc.fields()[14], "PPD Category Type")
Example 6
 def test_new_delimiter_and_timeformat_header(self):
     start_count = self._col.count_documents({})
     fc = FieldFile(f("data/mot.tff"))
     parser = LineToDictParser(fc)
     reader = FileReader(f('data/mot_test_set_small.csv'), has_header=False, delimiter="|")
     self.assertIsInstance(reader.name, str)
     bw = FileWriter(self._col, reader=reader, parser=parser)
     total = bw.write()
     lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
     inserted_count = self._col.count_documents({}) - start_count
     self.assertEqual(inserted_count, total)
     self.assertEqual(inserted_count, lines)
Example 7
    def test_Config_File(self):
        ff = FieldFile(f("data/10k.tff"))
        self.assertTrue("test_id" in ff.fields())
        self.assertTrue("cylinder_capacity" in ff.fields())

        self.assertEqual(ff.type_value("test_id"), "int")
        self.assertEqual(ff.type_value("test_date"), "datetime")
Example 8
    def test_date(self):
        config = FieldFile(f("data/inventory_dates.tff"))
        parser = LineToDictParser(config, locator=False)  # screws up comparison later if locator is true
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        start_count = self._col.count_documents({})
        writer = FileWriter(self._col, reader=reader, parser=parser)
        docs_written = writer.write()
        line_count = LineCounter(f("data/inventory.csv")).line_count
        self.assertEqual(self._col.count_documents({}) - start_count, line_count - 1)  # header must be subtracted
        self.assertEqual(self._col.count_documents({}), docs_written)

        nuts_doc = self._col.find_one({"Last Order": dateutil.parser.parse("29-Feb-2016")})
        self.assertTrue(nuts_doc)
Example 9
    def processOneFile(self, input_filename, field_filename=None, hasheader=False, restart=False, batchID=None):

        if not field_filename:
            field_filename = FieldFile.make_default_tff_name(input_filename)

        cmd = ImportCommand(collection=self._collection,
                            field_filename=field_filename,
                            delimiter=self._delimiter,
                            has_header=hasheader,
                            onerror=self._onerror,
                            limit=self._limit)

        cmd.run(input_filename)
        return cmd.total_written()
Example 10
 def test_generate_fieldfile(self):
     fc = FieldFile.generate_field_file(f("data/inventory.csv"), ext="testff")
     self.assertEqual(fc.field_filename, f("data/inventory.testff"), fc.field_filename)
     self.assertTrue(os.path.isfile(f("data/inventory.testff")), f("data/inventory.testff"))
     parser = LineToDictParser(fc)
     reader = FileReader(f("data/inventory.csv"), has_header=True)
     start_count = self._col.count_documents({})
     writer = FileWriter(self._col, reader=reader, parser=parser)
     write_count = writer.write()
     line_count = LineCounter(f("data/inventory.csv")).line_count
     new_inserted_count = self._col.count_documents({}) - start_count
     self.assertEqual(new_inserted_count, write_count)
     self.assertEqual(new_inserted_count, line_count - 1)  # header must be subtracted
     os.unlink(f("data/inventory.testff"))
Example 11
    def pre_execute(self, arg):
        # print(f"'{arg}'")
        super().pre_execute(arg)
        self._log.info("Using collection:'{}'".format(self._collection.full_name))

        if self._field_filename is None:
            self._field_filename = FieldFile.make_default_tff_name(arg)

        self._log.info(f"Using field file:'{self._field_filename}'")

        if not os.path.isfile(self._field_filename):
            raise OSError(f"No such field file:'{self._field_filename}'")

        self._fieldinfo = FieldFile(self._field_filename)

        self._reader = FileReader(arg,
                                  limit=self._limit,
                                  has_header=self._has_header,
                                  delimiter=self._delimiter)
        self._parser = LineToDictParser(self._fieldinfo,
                                        locator=self._locator,
                                        timestamp=self._timestamp,
                                        onerror=self._onerror)
        self._writer = FileWriter(self._collection, self._reader, self._parser)
Example 12
    def run(self, filename):
        if not self._log:
            self._log = Logger(self._args.logname, self._args.loglevel).log()

        if not self._args.silent:
            Logger.add_stream_handler(self._args.logname)

        self._log.info("Started pymongoimport")

        if self._field_filename is None:
            self._field_filename = FieldFile.make_default_tff_name(filename)

        if self._write_concern == 0:  # pymongo won't allow other args with w=0 even if they are false
            client = pymongo.MongoClient(self._host, w=self._write_concern)
        else:
            client = pymongo.MongoClient(self._host, w=self._write_concern, fsync=self._fsync, j=self._journal)

        database = client[self._database_name]
        self._collection = database[self._collection_name]

        self._log.info(f"Write concern : {self._write_concern}")
        self._log.info(f"journal       : {self._journal}")
        self._log.info(f"fsync         : {self._fsync}")
        self._log.info(f"has header    : {self._has_header}")

        cmd = ImportCommand(collection=self._collection,
                            field_filename=self._field_filename,
                            delimiter=self._delimiter,
                            has_header=self._has_header,
                            onerror=self._onerror,
                            limit=self._limit,
                            audit=self._audit,
                            locator=self._locator,
                            timestamp=self._timestamp,
                            id=self._batch_ID)

        cmd.run(filename)

        return 1
Example 13
    def test_http_generate_fieldfile(self):
        if check_internet():
            # Demographic_Statistics_By_Zip_Code.csv
            url = "https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.csv?accessType=DOWNLOAD"

            ff_file = FieldFile.generate_field_file(
                url,
                delimiter=",",
                ff_filename=f("data/Demographic_Statistics_By_Zip_Code.tff"))

            self.assertTrue("JURISDICTION NAME" in ff_file.fields(),
                            ff_file.fields())
            self.assertEqual(len(ff_file.fields()), 46)
            self.assertTrue(
                "PERCENT PUBLIC ASSISTANCE TOTAL" in ff_file.fields())

            os.unlink(f("data/Demographic_Statistics_By_Zip_Code.tff"))

        else:
            print(
                "Warning: No internet connection: skipping test for generating field files from URLs"
            )
Example 14
    def test_reader(self):
        fc = FieldFile.generate_field_file(f("data/inventory.csv"), f("data/inventory_test.tff"))
        ff = FieldFile(fc.field_filename)
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        parser = LineToDictParser(ff)
        for i, row in enumerate(reader.readline(), 1):
            doc = parser.parse_list(row, i)
            for field in ff.fields():
                self.assertTrue(field in doc, f"'{field}'")

        os.unlink(fc.field_filename)

        ff = FieldFile(f("data/uk_property_prices.tff"))
        reader = FileReader(f("data/uk_property_prices.csv"), has_header=True)

        parser = LineToDictParser(ff)
        for i, row in enumerate(reader.readline(), 1):
            doc = parser.parse_list(row, i)
            for field in ff.fields():
                if field == "txn":  # converted to _id field
                    continue
                self.assertTrue(field in doc, f"{field} not present")
                self.assertTrue(type(doc["Price"]) == int)
                self.assertTrue(type(doc["Date of Transfer"]) == datetime)
Example 15
def pymongoimport_main(input_args=None):
    """
    Expects to receive an array of args.

    1.3: Added support for the NHS Public Data sets project (--addfilename and --addtimestamp).
    Type conversions now fall back to string when they fail.
    
    >>> pymongoimport_main( [ 'test_set_small.txt' ] )
    database: test, collection: test
    files ['test_set_small.txt']
    Processing : test_set_small.txt
    Completed processing : test_set_small.txt, (100 records)
    Processed test_set_small.txt
    """

    usage_message = """
    
    pymongoimport is a Python program that imports data into a MongoDB
    database (default 'test') and a MongoDB collection (default 'test').

    Every file in the input list must conform to a single field file format,
    common across all the files. The field file is specified by the
    --fieldfile parameter.
    
    An example run:
    
    python pymongoimport.py --database demo --collection demo --fieldfile test_set_small.ff test_set_small.txt
    """

    # if input_args:
    #     print("args: {}".format( " ".join(input_args)))

    parser = argparse.ArgumentParser(usage=usage_message)
    parser = add_standard_args(parser)
    # print( "Argv: %s" % argv )
    # print(argv)

    if input_args:
        cmd = input_args
        args = parser.parse_args(cmd)
    else:
        cmd = tuple(sys.argv[1:])
        args = parser.parse_args(cmd)
        cmd_args = " ".join(cmd)
    # print("args: %s" % args)

    log = Logger(args.logname, args.loglevel).log()

    # Logger.add_file_handler(args.logname)

    if not args.silent:
        Logger.add_stream_handler(args.logname)

    #print(args.filenames)

    if args.filelist:
        try:
            with open(args.filelist) as input_file:
                for line in input_file:
                    args.filenames.append(line.strip())
        except OSError as e:
            log.error(f"{e}")

    if args.writeconcern == 0:  # pymongo won't allow other args with w=0 even if they are false
        client = pymongo.MongoClient(args.host, w=args.writeconcern)
    else:
        client = pymongo.MongoClient(args.host, w=args.writeconcern, fsync=args.fsync, j=args.journal)

    if args.genfieldfile:
        args.has_header = True
        log.info('Forcing has_header true for --genfieldfile')
        cmd = GenerateFieldfileCommand(field_filename=args.fieldfile, delimiter=args.delimiter)
        for i in args.filenames:
            cmd.run(i)

    if args.audit:
        audit = Audit(client=client)
        batch_ID = audit.start_batch({"command": input_args})
    else:
        audit = None
        batch_ID = None

    if args.database:
        database_name = args.database
    else:
        database_name = "PYIM"

    if args.collection:
        collection_name = args.collection
    else:
        collection_name = "ported"

    database = client[database_name]
    collection = database[collection_name]

    if args.drop:
        if args.restart:
            log.info("Warning --restart overrides --drop ignoring drop commmand")
        else:
            cmd = Drop_Command(audit=audit, id=batch_ID, database=database)
            cmd.run(collection_name)

    if args.fieldinfo:
        cfg = FieldFile(args.fieldinfo)

        for i, field in enumerate(cfg.fields(), 1):
            print(f"{i:3}. {field:25}:{cfg.type_value(field)}")
        print(f"Total fields: {len(cfg.fields())}")

    if not args.genfieldfile:
        if args.filenames:

            if args.audit:
                audit = Audit(client=client)
                batch_ID = audit.start_batch({"command": sys.argv})
            else:
                audit = None
                batch_ID = None

            process = Importer(audit, batch_ID, args)

            for i in args.filenames:
                try:
                    process.run(i)
                except OSError as e:
                    log.error(f"{e}")
                except exceptions.HTTPError as e:
                    log.error(f"{e}")


            if args.audit:
                audit.end_batch(batch_ID)

        else:
            log.info("No input files: Nothing to do")

    return 1
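
The entry point above can also be driven programmatically by passing an argument list, as the doctest suggests. A minimal sketch, mirroring the example run from the usage message; the database, collection, and file names are just the illustrative values from that message, and a MongoDB instance reachable at the default host is assumed.

    # Equivalent to the example run shown in usage_message:
    #   python pymongoimport.py --database demo --collection demo \
    #       --fieldfile test_set_small.ff test_set_small.txt
    pymongoimport_main(['--database', 'demo',
                        '--collection', 'demo',
                        '--fieldfile', 'test_set_small.ff',
                        'test_set_small.txt'])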
Example 16
    def test_generate_field_filename(self):
        gfc = FieldFile.generate_field_file(f('data/inventory.csv'), ext="xx")
        self.assertEqual(gfc.field_filename, f("data/inventory.xx"))
        rfc = FieldFile(gfc.field_filename)
        self.assertTrue("Inventory Item" in rfc.fields())
        self.assertTrue("Amount" in rfc.fields())
        self.assertTrue("Last Order", rfc.fields())
        self.assertEqual(len(rfc.fields()), 3)
        os.unlink(gfc.field_filename)

        fc = FieldFile.generate_field_file(f('data/inventory.csv'))
        self.assertEqual(fc.field_filename, f("data/inventory.tff"))
        os.unlink(fc.field_filename)

        fc = FieldFile.generate_field_file(f('data/inventory.csv.1'))
        self.assertEqual(fc.field_filename, f("data/inventory.csv.tff"), fc.field_filename)
        os.unlink(fc.field_filename)

        fc = FieldFile.generate_field_file(f('data/yellow_tripdata_2015-01-06-200k.csv.1'))
        self.assertEqual(fc.field_filename, f("data/yellow_tripdata_2015-01-06-200k.csv.tff"), fc.field_filename)
        os.unlink(fc.field_filename)

        fc = FieldFile.generate_field_file(f('data/yellow_tripdata_2015-01-06-200k.csv.10'))
        self.assertEqual(fc.field_filename, f("data/yellow_tripdata_2015-01-06-200k.csv.tff"), fc.field_filename)
        os.unlink(fc.field_filename)

        fc = FieldFile.generate_field_file(f('data/test_results_2016_10.txt.1'))
        self.assertEqual(fc.field_filename, f("data/test_results_2016_10.txt.tff"), fc.field_filename)
        os.unlink(fc.field_filename)
Example 17
 def execute(self, arg):
     ff = FieldFile.generate_field_file(csv_filename=arg, ff_filename=self._field_filename)
     self._field_filename = ff.field_filename
     return self._field_filename
Example 18
 def testFieldDict(self):
     d = FieldFile(f("data/testresults.tff")).field_dict
     self.assertTrue("TestID" in d)
     self.assertTrue("FirstUseDate" in d)
     self.assertTrue("Colour" in d)
     self.assertTrue(d["TestID"]["type"] == "int")
Example 19
 def test_property_prices(self):
     ff = FieldFile(f("data/uk_property_prices.tff"))
     self.assertTrue(ff.has_new_name("txn"))
     self.assertFalse(ff.name_value("txn") is None)
Example 20
 def setUp(self):
     self._client = pymongo.MongoClient()
     self._db = self._client["PYIM_HTTP_TEST"]
     self._collection = self._db["PYIM_HTTP_TEST"]
     self._ff = FieldFile(f("data/2018_Yellow_Taxi_Trip_Data_1000.ff"))
     self._parser = LineToDictParser(self._ff)