Code example #1
    def _auto_split_helper(self,
                           filename,
                           split_count,
                           has_header=False,
                           dos_adjust=False):

        splitter = File_Splitter(filename, has_header=has_header)
        count = 0
        part_total_size = 0
        part_total_count = 0
        total_line_count = splitter.no_of_lines(include_header=has_header)

        for (part_name, line_count) in splitter.autosplit(split_count):
            splitter_part = File_Splitter(part_name)
            part_count = splitter_part.no_of_lines()
            self.assertGreater(part_count, 0)
            self.assertEqual(part_count, line_count)
            part_total_count = part_total_count + part_count
            part_total_size = part_total_size + splitter_part.size()
            os.unlink(part_name)

        self.assertEqual(part_total_count,
                         splitter.no_of_lines(include_header=not has_header))
        self.assertEqual(
            part_total_size,
            splitter.size(include_header=False, dos_adjust=dos_adjust))
Code example #2
 def test_count_lines(self):
     self.assertEqual(3, File_Splitter("data/threelines.txt").no_of_lines())
     self.assertEqual(0, File_Splitter("data/emptyfile.txt").no_of_lines())
     self.assertEqual(4, File_Splitter("data/fourlines.txt").no_of_lines())
     self.assertEqual(5, File_Splitter("data/inventory.csv").no_of_lines())
     self.assertEqual(
         4,
         File_Splitter("data/inventory.csv").no_of_lines(
             include_header=False))
Code example #3
    def test_copy_file(self):
        splitter = File_Splitter(f("data/AandE_Data_2011-04-10.csv"),
                                 has_header=True)
        self.assertEqual(splitter.file_type(), FileType.DOS)
        (_, total_lines) = splitter.copy_file(
            f("data/AandE_Data_2011-04-10.csv") + ".1", ignore_header=True)

        #
        # The copy drops the header and converts each DOS \r\n ending to \n,
        # so we subtract one byte for every line of the original and then
        # subtract the header line itself (terminator included). The
        # header's \r has now been counted twice, so we add one byte back.
        #
        self.assertEqual(
            os.path.getsize(f("data/AandE_Data_2011-04-10.csv.1")),
            splitter.size() - splitter.line_count()
            - len(splitter.header_line()) + 1)
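The size check above can be reproduced without File_Splitter at all. Below is a self-contained sketch of the same byte arithmetic (not project code), assuming copy_file drops the header and rewrites DOS \r\n endings as \n:

# Hypothetical, self-contained illustration of the DOS size arithmetic.
header = "Code,Name\r\n"                          # header line, DOS-terminated
data = ["RA4,Foo\r\n", "RA5,Bar\r\n", "RA6,Baz\r\n"]

original_size = len(header) + sum(len(line) for line in data)
copy_size = sum(len(line) - 1 for line in data)   # header dropped, \r\n -> \n

line_count = 1 + len(data)                        # header plus data lines
# One byte saved per original line, minus the header line (terminator
# included); the header's \r was counted twice, so add one byte back.
expected = original_size - line_count - len(header) + 1
assert copy_size == expected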
Code example #4
    def test_generate_fieldfile(self):

        fc_filename = FieldConfig.generate_field_file("data/inventory.csv",
                                                      ext="testff")
        self.assertTrue(os.path.isfile("data/inventory.testff"))
        fc = FieldConfig(fc_filename, hasheader=True)

        start_count = self._col.count()
        writer = File_Writer(self._col, fc)
        writer.insert_file("data/inventory.csv")
        line_count = File_Splitter("data/inventory.csv").count_lines()
        self.assertEqual(self._col.count() - start_count,
                         line_count - 1)  # header must be subtracted

        os.unlink("data/inventory.testff")

        with open("data/inventory.csv", "r") as f:
            if fc.hasheader():
                _ = f.readline()
            reader = fc.get_dict_reader(f)
            fields = fc.fields()
            for row in reader:
                # print( row )
                for field in fields:
                    row[field] = fc.type_convert(
                        row[field],
                        fc.typeData(field))  # remember we type convert fields

                doc = self._col.find_one(row)
                self.assertTrue(doc)
Code example #5
 def test_A_and_E_data(self):
     start_count = self._col.count()
     fp = FileProcessor(self._col, ',', onerror="ignore")
     fp.processOneFile(input_filename="data/AandE_Data_2011-04-10.csv",
                       hasheader=True)
     lines = File_Splitter("data/AandE_Data_2011-04-10.csv").count_lines()
     self.assertEqual(lines, self._col.count() - start_count + 1)
     self.assertTrue(self._col.find_one({"Code": "RA4"}))
Code example #6
    def test_mot_data(self):

        start_count = self._col.count()
        fp = FileProcessor(self._col, '|')
        fp.processOneFile("data/10k.txt")
        lines = File_Splitter("data/10k.txt").count_lines()
        self.assertEqual(lines, self._col.count() - start_count)
        self.assertTrue(self._col.find_one({"TestID": 114624}))
Code example #7
    def _split_helper(self, filename, split_size, has_header=False, dos_adjust=False):

        splitter = File_Splitter(filename, has_header)

        count = 0
        part_total_size = 0
        part_total_count = 0

        for (part_name, line_count) in splitter.splitfile(split_size):
            splitter_part = File_Splitter(part_name)
            part_count = LineCounter(part_name).line_count()
            self.assertEqual(part_count, line_count)
            part_total_count = part_total_count + part_count
            part_total_size = part_total_size + splitter_part.size()
            os.unlink(part_name)

        lc = LineCounter(filename)

        if has_header:
            self.assertEqual(part_total_count, lc.line_count() - 1)
        else:
            self.assertEqual(part_total_count, lc.line_count())

        if dos_adjust:
            self.assertEqual(part_total_size,
                             splitter.size() - lc.line_count()
                             - len(splitter.header_line()) + 1)
        else:
            self.assertEqual(part_total_size,
                             splitter.size() - len(splitter.header_line()))
Code example #8
 def _compare_input_output(self, input_filename, output_filenames):
     original_count = 0
     file_piece_count = 0
     with open(input_filename, "r") as original_file:
         for filename in File_Splitter.shim_names(output_filenames):
             with open(filename, "r") as file_piece:
                 for line in file_piece:
                     left = original_file.readline()
                     original_count = original_count + 1
                     right = line
                     file_piece_count = file_piece_count + 1
                     self.assertEqual(left, right)
             os.unlink(filename)
Code example #9
    def test_property_prices(self):

        start_count = self._col.count()
        fp = FileProcessor(self._col, ',')
        try:
            fp.processOneFile("data/uk_property_prices.csv")
        except pymongo.errors.BulkWriteError as e:
            print(e)
            raise
        lines = File_Splitter("data/uk_property_prices.csv").count_lines()
        self.assertEqual(lines, self._col.count() - start_count)

        self.assertTrue(self._col.find_one({"Postcode": "NG10 5NN"}))
Code example #10
 def _compare_input_output(self, input_filename, output_filenames, has_header=False):
     original_count = 0
     file_piece_count = 0
     with open(input_filename, "r") as original_file:
         if has_header:
             _ = original_file.readline()
         for filename in File_Splitter.shim_names(output_filenames):
             with open(filename, "r") as file_piece:
                 for line in file_piece:
                     left = original_file.readline()
                     original_count = original_count + 1 
                     right = line
                     file_piece_count = file_piece_count + 1 
                     self.assertEqual(left, right)
             os.unlink(filename)
Code example #11
    def test_copy_file(self):
        splitter = File_Splitter("data/AandE_Data_2011-04-10.csv",
                                 has_header=True)
        self.assertEqual(splitter.file_type(), File_Type.DOS)
        (_, total_lines) = splitter.copy_file(
            "data/AandE_Data_2011-04-10.csv" + ".1", ignore_header=True)

        self.assertEqual(
            File_Splitter("data/AandE_Data_2011-04-10.csv.1").size(),
            splitter.size(include_header=False, dos_adjust=True))
Code example #12
def split_file(*argv):
    usage_message = '''

Split a text file into separate pieces. If you specify
autosplit then the program will use the first ten lines
to calculate an average line size and use that to
determine the rough number of splits.

If you use **--splitsize** then the file will be split
into chunks of **--splitsize** lines until it is consumed.
'''

    parser = argparse.ArgumentParser(usage=usage_message)

    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __VERSION__)
    parser.add_argument("--autosplit", type=int,
                        help="split file based on looking at the first ten lines and overall file size [default : %(default)s]")
    parser.add_argument('--hasheader', default=False, action="store_true",
                        help="Use header line for column names [default: %(default)s]")
    parser.add_argument("--splitsize", type=int, help="Split file into chunks of this size")
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    if len(args.filenames) == 0:
        print("No input file specified to split")
        sys.exit(0)
    elif len(args.filenames) > 1:
        print("More than one input file specified ( %s ) only splitting the first file:'%s'" %
              (" ".join(args.filenames), args.filenames[0]))

    splitter = File_Splitter(args.filenames[0], args.hasheader)
    if args.autosplit:
        print("Autosplitting: '%s'" % args.filenames[0])
        files = splitter.autosplit(args.autosplit)
    else:
        print("Splitting '%s' using %i splitsize" % (args.filenames[0], args.splitsize))
        files = splitter.split_file(args.splitsize)
    # print( "Split '%s' into %i parts"  % ( args.filenames[ 0 ], len( files )))
    count = 1
    total_size = 0
    results = list(files)
    for (i, lines) in results:
        size = os.path.getsize(i)
        total_size = total_size + size
        print("%i. '%s'. Lines : %i, Size: %i" % (count, i, lines, size))
        count = count + 1

    if total_size != splitter.no_header_size():
        raise ValueError(
            "Filesize of original and pieces does not match: "
            "total_size: %i, no header split_size: %i" %
            (total_size, splitter.no_header_size()))

    return results
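A minimal sketch of driving split_file() programmatically; the import path is an assumption, so adjust it to wherever the function lives in your checkout:

import os

from pymongo_import.splitfile import split_file  # hypothetical module path

# Split a headed CSV into roughly four pieces and report each piece.
for part_name, line_count in split_file(
        ["--autosplit", "4", "--hasheader", "data/inventory.csv"]):
    print(part_name, line_count, os.path.getsize(part_name))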
Code example #13
File: pwc.py  Project: mrlynn/pymongo_import
def pwc(*argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    line_count = 0
    total_count = 0
    total_size = 0
    size = 0
    if args.filenames:
        print("lines\tbytes\tfilename")
    for filename in args.filenames:
        (line_count, size) = File_Splitter.wc(filename)
        total_count = total_count + line_count
        total_size = total_size + size
        print("%i\t%i\t%s" % (line_count, size, filename))
    if len(args.filenames) > 1:
        print("%i\t%i\ttotal" % (total_count, total_size))
Code example #14
def split_file_main(*argv):
    usage_message = '''

Split a text file into separate pieces. If you specify
autosplit then the program will use the first ten lines
to calculate an average line size and use that to
determine the rough number of splits.

If you use **--splitsize** then the file will be split
into chunks of **--splitsize** lines until it is consumed.
'''

    parser = argparse.ArgumentParser(usage=usage_message)

    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __VERSION__)
    parser.add_argument("--autosplit", type=int,
                        help="split file based on looking at the first ten lines and overall file size [default : %(default)s]")
    parser.add_argument('--hasheader', default=False, action="store_true",
                        help="Ignore header when calculating splits, don't include header in output")
    parser.add_argument('--usefieldfile', type=str,
                        help="Use this field file and copy to match split filenames")
    parser.add_argument('--generatefieldfile', default=False, action="store_true",
                        help="Generate a fieldfile for each input file")
    parser.add_argument('--delimiter', default=",", help="Delimiter for fields [default : %(default)s]")
    parser.add_argument("--splitsize", type=int, help="Split file into chunks of this size")
    parser.add_argument('--verbose', default=False, action="store_true",
                        help="Print out what is happening")
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    if len(args.filenames) == 0:
        print("No input file specified to split")
        sys.exit(0)

    files = []

    for i in args.filenames:

        if not os.path.isfile(i):
            print( "No such input file:'{}'".format(i))
            continue

        splitter = File_Splitter(i, args.hasheader)

        if args.autosplit:
            if args.verbose:
                print("Autosplitting: '{}' into approximately {} parts".format(i, args.autosplit))
            for newfile in splitter.autosplit(args.autosplit):
                files.append(newfile)
        else:
            if args.verbose:
                print("Splitting '%s' using %i splitsize" % (args.filenames[0], args.splitsize))
            for newfile in splitter.splitfile(args.splitsize):
                files.append(newfile)

        # print( "Split '%s' into %i parts"  % ( args.filenames[ 0 ], len( files )))

    count = 1
    total_size = 0
    total_lines = 0
    results = list(files)
    for (i, lines) in results:
        size = os.path.getsize(i)
        total_size = total_size + size
        total_lines = total_lines + lines
        if args.verbose:
            print("{:4}. '{:20}'. Lines : {:6}, Size: {:10}".format(count, i, lines, size))

        count = count + 1
    if len(files) > 1:
        if args.verbose:
            print("{} {:16} {:17}".format( " " * (len(i) + 7), total_lines, total_size))

    if files and (total_size != splitter.no_header_size()):
        raise ValueError(
            "Filesize of original and pieces does not match: "
            "total_size: %i, no header split_size: %i" %
            (total_size, splitter.no_header_size()))

    return results
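A usage sketch for split_file_main() with an explicit chunk size; again the import path is an assumption:

from pymongo_import.splitfile import split_file_main  # hypothetical module path

# Split one file into 1000-line pieces, printing a summary line per piece.
parts = split_file_main(["--splitsize", "1000", "--verbose", "data/10k.txt"])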
Code example #15
 def test_get_average_line_size(self):
     self.assertEqual(
         10,
         File_Splitter("data/tenlines.txt").get_average_line_size())
Code example #16
def multi_import(*argv):
    """
.. function:: multi_import(*argv)

   Import CSV files using multiprocessing

   :param argv: list of command lines

   """

    usage_message = '''

    A master script to manage uploading of a single data file as multiple input files. Multi-import
    will optionally split a single file (specified by the --single argument) or upload an
    already split list of files passed in on the command line.
    Each file is uploaded by a separate pymongoimport subprocess.
    '''

    parser = argparse.ArgumentParser(usage=usage_message)
    parser = add_standard_args(parser)
    parser.add_argument(
        "--autosplit",
        type=int,
        help=
        "split file based on loooking at the first ten lines and overall file size [default : %(default)s]"
    )
    parser.add_argument(
        "--splitsize",
        type=int,
        help="Split file into chunks of this size [default : %(default)s]")

    args = parser.parse_args(*argv)

    log = Logger("multiimport").log()

    Logger.add_file_handler("multiimport")
    Logger.add_stream_handler("multiimport")

    child_args = sys.argv[1:]
    children = OrderedDict()

    print(args.filenames)
    if len(args.filenames) == 0:
        log.info("no input file to split")
        sys.exit(0)

    if args.autosplit or args.splitsize:
        if len(args.filenames) > 1:
            log.warn(
                "More than one input file specified ( '%s' ); only splitting the first file: '%s'",
                " ".join(args.filenames), args.filenames[0])
        if args.autosplit:
            child_args = strip_arg(child_args, "--autosplit", True)
        if args.splitsize:
            child_args = strip_arg(child_args, "--splitsize", True)

        splitter = File_Splitter(args.filenames[0], args.hasheader)

    for i in args.filenames:  # get rid of old filenames
        child_args = strip_arg(child_args, i, False)

    if args.autosplit:
        log.info("Autosplitting file: '%s' into (approx) %i chunks",
                 args.filenames[0], args.autosplit)
        files = splitter.autosplit(args.autosplit)
    elif args.splitsize and args.splitsize > 0:
        log.info("Splitting file: '%s' into %i line chunks", args.filenames[0],
                 args.splitsize)
        files = splitter.split_file(args.splitsize)
    else:
        files = []
        for i in args.filenames:
            files.append((i, os.path.getsize(i)))

    if args.restart:
        log.info("Ignoring --drop overridden by --restart")
    elif args.drop:
        client = pymongo.MongoClient(args.host)
        log.info("Dropping database : %s", args.database)
        client.drop_database(args.database)
        child_args = strip_arg(child_args, args.drop)

    start = time.time()

    process_count = 0
    try:
        for filename in files:
            process_count = process_count + 1
            proc_name = filename[0]
            # args passed to Process must be a tuple
            new_args = copy.deepcopy(child_args)
            #new_args.extend( [ "--logname", filename[0], filename[0] ] )
            new_args.extend([filename[0]])
            proc = Process(target=mongo_import,
                           name=proc_name,
                           args=(new_args, ))
            log.info("Processing '%s'", filename[0])
            proc.daemon = True
            children[proc_name] = {"process": proc}
            log.info("starting sub process: %s", proc_name)
            children[proc_name]["start"] = time.time()
            proc.start()

        for i in children.keys():
            log.info("Waiting for process: '%s' to complete", i)
            children[i]["process"].join()
            children[i]["end"] = time.time()
            log.info("elapsed time for process %s : %f", i,
                     children[i]["end"] - children[i]["start"])

    except KeyboardInterrupt:
        for i in children.keys():
            log.info("terminating process: '%s'", i)
            children[i]["process"].terminate()

    finish = time.time()

    log.info("Total elapsed time:%f" % (finish - start))
Code example #17
    def _auto_split_helper(self, filename, lines, split_count, has_header=False, dos_adjust=False):

        splitter = File_Splitter(filename, has_header=has_header)
        count = 0
        part_total_size = 0
        part_total_count = 0
        total_line_count = LineCounter(filename).line_count()
        self.assertEqual(total_line_count, lines)
        for (part_name, line_count) in splitter.autosplit(split_count):
            splitter_part = File_Splitter(part_name)
            part_count = LineCounter(part_name).line_count()
            self.assertGreater(part_count, 0)
            self.assertEqual(part_count, line_count)
            part_total_count = part_total_count + part_count
            part_total_size = part_total_size + splitter_part.size()
            os.unlink(part_name)

        lc = LineCounter(filename)

        if has_header:
            self.assertEqual(part_total_count, lines - 1)
            if splitter.file_type() is FileType.DOS:
                self.assertEqual(part_total_size,
                                 splitter.size() - lc.line_count()
                                 - len(splitter.header_line()) + 1)
            else:
                self.assertEqual(part_total_size,
                                 splitter.size() - len(splitter.header_line()))
        else:
            self.assertEqual(part_total_count, lines)
            if splitter.file_type() is FileType.DOS:
                self.assertEqual(part_total_size, splitter.size() - lc.line_count())
            else:
                self.assertEqual(part_total_size, splitter.size())