コード例 #1
0
    def _split_helper(self,
                      filename,
                      split_size,
                      has_header=False,
                      dos_adjust=False):
        """Split *filename* into parts of *split_size* lines and verify that
        the parts account for every data line of the original file.

        Each part's reported line count is checked against an independent
        LineCounter count, and the sum of all part counts must equal the
        original file's line count (minus one when *has_header* is true,
        since the header line is not copied into any part).

        NOTE(review): *dos_adjust* is accepted for interface compatibility
        but unused in this helper — confirm whether callers still need it.
        """
        splitter = File_Splitter(filename, has_header)

        part_total_count = 0
        for (part_name, line_count) in splitter.splitfile(split_size):
            part_count = LineCounter(part_name).line_count
            self.assertEqual(part_count, line_count)
            part_total_count = part_total_count + part_count
            os.unlink(part_name)  # clean up the temporary part file

        lc = LineCounter(filename)

        if has_header:
            # The header line is not reproduced in any of the parts.
            self.assertEqual(part_total_count, lc.line_count - 1)
        else:
            self.assertEqual(part_total_count, lc.line_count)
コード例 #2
0
 def _test_file(self,
                count,
                doseol=False,
                filename="liner.txt",
                unlink=True):
     """Generate a file with *count* lines and check LineCounter agrees.

     The generated file is removed afterwards unless *unlink* is false.
     """
     generated = make_line_file(count=count, doseol=doseol, filename=filename)
     self.assertEqual(count, LineCounter(generated).line_count)
     if unlink:
         os.unlink(generated)
コード例 #3
0
ファイル: pwc.py プロジェクト: judy2k/pymongoimport
def pwc(*argv):
    """Print a wc-style line/byte count for each file argument.

    An explicit argument list may be passed (useful for testing); otherwise
    argparse falls back to sys.argv. One row is printed per file, plus a
    grand-total row when more than one file is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    total_count = 0
    total_size = 0
    if args.filenames:
        print("lines\tbytes\tfilename")
    for filename in args.filenames:
        counter = LineCounter(filename)
        file_size = counter.file_size()  # hoisted: avoid computing the size twice
        total_count = total_count + counter.line_count
        total_size = total_size + file_size

        print("%i\t%i\t%s" %
              (counter.line_count, file_size, filename))
    if len(args.filenames) > 1:
        print("%i\t%i\ttotal" % (total_count, total_size))
コード例 #4
0
    def test_date_format(self):
        """Load a pipe-delimited MOT file containing time-formatted fields
        and verify every line becomes exactly one document."""
        collection = self._database["mot"]
        before = collection.count_documents({})

        processor = FileProcessor(collection, delimiter='|')
        processor.processOneFile(f("data/mot_time_format_test.txt"))

        expected_lines = LineCounter(f("data/mot_time_format_test.txt")).line_count
        self.assertEqual(expected_lines, collection.count_documents({}) - before)
        self.assertTrue(collection.find_one({"test_id": 1077}))
コード例 #5
0
    def test_mot_data(self):
        """Import the 10k-line pipe-delimited MOT data set and verify a
        one-document-per-line insert count."""
        collection = self._database["mot"]
        before = collection.count_documents({})

        processor = FileProcessor(collection, '|')
        processor.processOneFile(f("data/10k.txt"))

        expected_lines = LineCounter(f("data/10k.txt")).line_count
        self.assertEqual(expected_lines, collection.count_documents({}) - before)
        self.assertTrue(collection.find_one({"test_id": 114624}))
コード例 #6
0
    def test_A_and_E_data(self):
        """Import the A&E CSV (with header) and check the insert count.

        The file has a header line, so the line count exceeds the inserted
        document count by one.
        """
        collection = self._database["AandE"]
        before = collection.count_documents({})

        processor = FileProcessor(collection, ',', onerror="ignore")
        processor.processOneFile(input_filename=f("data/AandE_Data_2011-04-10.csv"),
                                 hasheader=True)

        file_lines = LineCounter(f("data/AandE_Data_2011-04-10.csv")).line_count
        inserted = collection.count_documents({}) - before
        self.assertEqual(file_lines, inserted + 1)
        self.assertTrue(collection.find_one({"Code": "RA4"}))
コード例 #7
0
 def test_new_delimiter_and_timeformat_header(self):
     """Write a pipe-delimited, headerless CSV through FileWriter and verify
     that every line of the input produced exactly one inserted document."""
     start_count = self._col.count_documents({})
     fc = FieldFile(f("data/mot.tff"))
     parser = LineToDictParser(fc)
     reader = FileReader(f('data/mot_test_set_small.csv'), has_header=False, delimiter="|")
     # FIX: assertIsInstance is the idiomatic type check — type(...) == str
     # rejects str subclasses and gives a poor failure message.
     self.assertIsInstance(reader.name, str)
     bw = FileWriter(self._col, reader=reader, parser=parser)
     total = bw.write()
     lines = LineCounter(f('data/mot_test_set_small.csv')).line_count
     inserted_count = self._col.count_documents({}) - start_count
     self.assertEqual(inserted_count, total)
     self.assertEqual(inserted_count, lines)
コード例 #8
0
ファイル: test_command.py プロジェクト: judy2k/pymongoimport
    def test_Import_Command(self):
        """Run ImportCommand over two files under an audit batch and verify
        the combined insert count matches the two files' line counts."""
        self._audit = Audit(database=self._client["TEST_AUDIT"])
        batch_id = self._audit.start_batch({"test": "test_batch"})
        collection = self._database["import_test"]

        start_size = collection.count_documents({})
        size_10k = LineCounter(f("data/10k.txt")).line_count
        size_120 = LineCounter(f("data/120lines.txt")).line_count

        cmd = ImportCommand(audit=self._audit,
                            id=batch_id,
                            collection=collection,
                            field_filename=f("data/10k.tff"),
                            delimiter="|",
                            has_header=False,
                            onerror="warn",
                            limit=0)
        cmd.run(f("data/10k.txt"), f("data/120lines.txt"))

        inserted = collection.count_documents({}) - start_size
        self.assertEqual(size_10k + size_120, inserted)

        self._audit.end_batch(batch_id)
コード例 #9
0
    def test_property_prices(self):
        """Import the UK property-prices CSV, re-raising (after printing)
        any bulk-write error, and verify one document per input line."""
        before = self._col.count_documents({})
        processor = FileProcessor(self._col, ',')
        try:
            processor.processOneFile(f("data/uk_property_prices.csv"))
        except pymongo.errors.BulkWriteError as bwe:
            # Surface the error details before failing the test.
            print(bwe)
            raise

        expected_lines = LineCounter(f("data/uk_property_prices.csv")).line_count
        self.assertEqual(expected_lines, self._col.count_documents({}) - before)
        self.assertTrue(self._col.find_one({"Postcode": "NG10 5NN"}))
コード例 #10
0
    def test_date(self):
        """Parse date fields via a field file and verify a known date value
        round-trips through the import."""
        config = FieldFile(f("data/inventory_dates.tff"))
        # locator=False: locator fields would break the document comparison below.
        parser = LineToDictParser(config, locator=False)
        reader = FileReader(f("data/inventory.csv"), has_header=True)
        start_count = self._col.count_documents({})
        writer = FileWriter(self._col, reader=reader, parser=parser)
        docs_written = writer.write()
        line_count = LineCounter(f("data/inventory.csv")).line_count
        inserted = self._col.count_documents({}) - start_count
        self.assertEqual(inserted, line_count - 1)  # header must be subtracted
        # FIX: compare the number of NEWLY inserted documents (delta) with
        # docs_written — the original compared the absolute collection size,
        # which only held when the collection started empty; every sibling
        # test uses the start_count delta.
        self.assertEqual(inserted, docs_written)

        nuts_doc = self._col.find_one({"Last Order": dateutil.parser.parse("29-Feb-2016")})
        self.assertTrue(nuts_doc)
コード例 #11
0
 def test_gdelt_data(self):
     """Import the tab-delimited GDELT sample using an explicit column
     field file and verify a one-document-per-line insert count."""
     collection = self._database["GDELT"]
     before = collection.count_documents({})

     processor = FileProcessor(collection, onerror="ignore", delimiter="tab")
     processor.processOneFile(input_filename=f("data/gdelt.tsv"),
                              hasheader=False,
                              field_filename=f("data/GDELT_columns.tff"))

     expected_lines = LineCounter(f("data/gdelt.tsv")).line_count
     self.assertEqual(expected_lines, collection.count_documents({}) - before)

     known_url = ("https://www.standardspeaker.com/news/"
                  "dream-factory-director-retiring-1.2467094")
     self.assertTrue(collection.find_one({"SOURCEURL": known_url}))
コード例 #12
0
 def test_generate_fieldfile(self):
     """Generate a field file from a CSV, import with it, then verify the
     insert count and remove the generated field file."""
     fc = FieldFile.generate_field_file(f("data/inventory.csv"), ext="testff")
     expected_ff = f("data/inventory.testff")
     self.assertEqual(fc.field_filename, expected_ff, fc.field_filename)
     self.assertTrue(os.path.isfile(expected_ff), expected_ff)

     parser = LineToDictParser(fc)
     reader = FileReader(f("data/inventory.csv"), has_header=True)
     before = self._col.count_documents({})
     writer = FileWriter(self._col, reader=reader, parser=parser)
     write_count = writer.write()

     line_count = LineCounter(f("data/inventory.csv")).line_count
     inserted = self._col.count_documents({}) - before
     self.assertEqual(inserted, write_count)
     self.assertEqual(inserted, line_count - 1)  # header must be subtracted

     os.unlink(expected_ff)
コード例 #13
0
    def _auto_split_helper(self,
                           filename,
                           lines,
                           split_count,
                           has_header=False,
                           dos_adjust=False):
        """Auto-split *filename* into *split_count* parts and verify that the
        non-empty parts together account for every data line of the original
        (minus the header when *has_header* is true)."""
        splitter = File_Splitter(filename, has_header=has_header)
        self.assertEqual(splitter.line_count, lines)

        accumulated = 0
        for part_name, reported_count in splitter.autosplit(split_count):
            actual_count = LineCounter(part_name).line_count
            self.assertGreater(actual_count, 0)
            self.assertEqual(actual_count, reported_count)
            accumulated += actual_count
            os.unlink(part_name)  # remove the temporary part file

        expected = lines - 1 if has_header else lines
        self.assertEqual(accumulated, expected)