def _auto_split_helper(self, filename, split_count, has_header=False, dos_adjust=False):
    splitter = File_Splitter(filename, has_header=has_header)
    part_total_size = 0
    part_total_count = 0

    for (part_name, line_count) in splitter.autosplit(split_count):
        splitter_part = File_Splitter(part_name)
        part_count = splitter_part.no_of_lines()
        self.assertGreater(part_count, 0)
        self.assertEqual(part_count, line_count)
        part_total_count = part_total_count + part_count
        part_total_size = part_total_size + splitter_part.size()
        os.unlink(part_name)

    self.assertEqual(part_total_count,
                     splitter.no_of_lines(include_header=not has_header))
    self.assertEqual(part_total_size,
                     splitter.size(include_header=False, dos_adjust=dos_adjust))
def test_count_lines(self):
    self.assertEqual(3, File_Splitter("data/threelines.txt").no_of_lines())
    self.assertEqual(0, File_Splitter("data/emptyfile.txt").no_of_lines())
    self.assertEqual(4, File_Splitter("data/fourlines.txt").no_of_lines())
    self.assertEqual(5, File_Splitter("data/inventory.csv").no_of_lines())
    self.assertEqual(4,
                     File_Splitter("data/inventory.csv").no_of_lines(include_header=False))
def test_copy_file(self):
    splitter = File_Splitter(f("data/AandE_Data_2011-04-10.csv"), has_header=True)
    self.assertEqual(splitter.file_type(), FileType.DOS)
    (_, total_lines) = splitter.copy_file(f("data/AandE_Data_2011-04-10.csv") + ".1",
                                          ignore_header=True)
    #
    # Subtract one char per line to account for the DOS \r\n endings, then
    # subtract the header line. The header was already counted once among the
    # lines of the original (it too has a \r\n), so add one back to
    # compensate for that extra char.
    #
    self.assertEqual(os.path.getsize(f("data/AandE_Data_2011-04-10.csv.1")),
                     splitter.size() - splitter.line_count() - len(splitter.header_line()) + 1)
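# A worked example of the size arithmetic above (numbers invented for
# illustration, assuming header_line() includes its \r\n terminator, which
# the +1 suggests): a DOS file of 1000 bytes with 10 lines and a 40-byte
# header line copies to 1000 - 10 - 40 + 1 = 951 bytes, i.e. nine data
# lines, each one byte shorter once \r\n is rewritten as \n, and no header.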
def test_generate_fieldfile(self):
    fc_filename = FieldConfig.generate_field_file("data/inventory.csv", ext="testff")
    self.assertTrue(os.path.isfile("data/inventory.testff"))
    fc = FieldConfig(fc_filename, hasheader=True)
    start_count = self._col.count()
    writer = File_Writer(self._col, fc)
    writer.insert_file("data/inventory.csv")
    line_count = File_Splitter("data/inventory.csv").count_lines()
    self.assertEqual(self._col.count() - start_count, line_count - 1)  # the header line is not inserted
    os.unlink("data/inventory.testff")

    with open("data/inventory.csv", "r") as csv_file:
        if fc.hasheader():
            _ = csv_file.readline()
        reader = fc.get_dict_reader(csv_file)
        fields = fc.fields()
        for row in reader:
            for field in fields:
                # fields are type-converted before insertion, so convert
                # here as well to make the find_one() query match
                row[field] = fc.type_convert(row[field], fc.typeData(field))
            doc = self._col.find_one(row)
            self.assertTrue(doc)
def test_A_and_E_data(self):
    start_count = self._col.count()
    fp = FileProcessor(self._col, ',', onerror="ignore")
    fp.processOneFile(input_filename="data/AandE_Data_2011-04-10.csv", hasheader=True)
    lines = File_Splitter("data/AandE_Data_2011-04-10.csv").count_lines()
    # the file line count includes the header line, which is not inserted
    self.assertEqual(lines, self._col.count() - start_count + 1)
    self.assertTrue(self._col.find_one({"Code": "RA4"}))
def test_mot_data(self):
    start_count = self._col.count()
    fp = FileProcessor(self._col, '|')
    fp.processOneFile("data/10k.txt")
    lines = File_Splitter("data/10k.txt").count_lines()
    self.assertEqual(lines, self._col.count() - start_count)
    self.assertTrue(self._col.find_one({"TestID": 114624}))
def _split_helper(self, filename, split_size, has_header=False, dos_adjust=False):
    splitter = File_Splitter(filename, has_header)
    part_total_size = 0
    part_total_count = 0

    for (part_name, line_count) in splitter.splitfile(split_size):
        splitter_part = File_Splitter(part_name)
        part_count = LineCounter(part_name).line_count()
        self.assertEqual(part_count, line_count)
        part_total_count = part_total_count + part_count
        part_total_size = part_total_size + splitter_part.size()
        os.unlink(part_name)

    lc = LineCounter(filename)
    if has_header:
        self.assertEqual(part_total_count, lc.line_count() - 1)
    else:
        self.assertEqual(part_total_count, lc.line_count())

    if dos_adjust:
        self.assertEqual(part_total_size,
                         splitter.size() - lc.line_count() - len(splitter.header_line()) + 1)
    else:
        self.assertEqual(part_total_size,
                         splitter.size() - len(splitter.header_line()))
def _compare_input_output(self, input_filename, output_filenames):
    original_count = 0
    file_piece_count = 0
    with open(input_filename, "r") as original_file:
        for filename in File_Splitter.shim_names(output_filenames):
            with open(filename, "r") as file_piece:
                for line in file_piece:
                    left = original_file.readline()
                    original_count = original_count + 1
                    right = line
                    file_piece_count = file_piece_count + 1
                    self.assertEqual(left, right)
            os.unlink(filename)
def test_property_prices(self):
    start_count = self._col.count()
    fp = FileProcessor(self._col, ',')
    try:
        fp.processOneFile("data/uk_property_prices.csv")
    except pymongo.errors.BulkWriteError as e:
        print(e)
        raise
    lines = File_Splitter("data/uk_property_prices.csv").count_lines()
    self.assertEqual(lines, self._col.count() - start_count)
    self.assertTrue(self._col.find_one({"Postcode": "NG10 5NN"}))
def _compare_input_output(self, input_filename, output_filenames, has_header=False):
    original_count = 0
    file_piece_count = 0
    with open(input_filename, "r") as original_file:
        if has_header:
            _ = original_file.readline()
        for filename in File_Splitter.shim_names(output_filenames):
            with open(filename, "r") as file_piece:
                for line in file_piece:
                    left = original_file.readline()
                    original_count = original_count + 1
                    right = line
                    file_piece_count = file_piece_count + 1
                    self.assertEqual(left, right)
            os.unlink(filename)
def test_copy_file(self):
    splitter = File_Splitter("data/AandE_Data_2011-04-10.csv", has_header=True)
    self.assertEqual(splitter.file_type(), File_Type.DOS)
    (_, total_lines) = splitter.copy_file("data/AandE_Data_2011-04-10.csv" + ".1",
                                          ignore_header=True)
    self.assertEqual(File_Splitter("data/AandE_Data_2011-04-10.csv.1").size(),
                     splitter.size(include_header=False, dos_adjust=True))
def split_file(*argv):
    usage_message = '''
    Split a text file into separate pieces. If you specify --autosplit then the
    program uses the first ten lines to calculate an average line size and from
    that determines the approximate number of splits. If you use --splitsize
    then the file is split into --splitsize chunks until it is consumed.
    '''

    parser = argparse.ArgumentParser(usage=usage_message)
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __VERSION__)
    parser.add_argument("--autosplit", type=int,
                        help="split file based on looking at the first ten lines and overall file size [default: %(default)s]")
    parser.add_argument('--hasheader', default=False, action="store_true",
                        help="Use header line for column names [default: %(default)s]")
    parser.add_argument("--splitsize", type=int,
                        help="Split file into chunks of this size")
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    if len(args.filenames) == 0:
        print("No input file specified to split")
        sys.exit(0)
    elif len(args.filenames) > 1:
        print("More than one input file specified ( %s ) only splitting the first file: '%s'" %
              (" ".join(args.filenames), args.filenames[0]))

    splitter = File_Splitter(args.filenames[0], args.hasheader)
    if args.autosplit:
        print("Autosplitting: '%s'" % args.filenames[0])
        files = splitter.autosplit(args.autosplit)
    else:
        print("Splitting '%s' using %i splitsize" % (args.filenames[0], args.splitsize))
        files = splitter.split_file(args.splitsize)

    count = 1
    total_size = 0
    results = list(files)
    for (part_name, lines) in results:
        size = os.path.getsize(part_name)
        total_size = total_size + size
        print("%i. '%s'. Lines : %i, Size: %i" % (count, part_name, lines, size))
        count = count + 1

    if total_size != splitter.no_header_size():
        raise ValueError("Filesize of original and pieces does not match: total_size: %i, no header split_size: %i" %
                         (total_size, splitter.no_header_size()))

    return results
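# A hypothetical command-line invocation (entry-point name and file paths are
# assumptions for illustration only):
#
#   split_file --autosplit 4 data/inventory.csv
#   split_file --splitsize 1000 data/inventory.csv
#
# The first asks for roughly four pieces sized from the average of the first
# ten lines; the second emits fixed 1000-line chunks until the file is consumed.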
def pwc(*argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    total_count = 0
    total_size = 0

    if args.filenames:
        print("lines\tbytes\tfilename")
        for filename in args.filenames:
            (line_count, size) = File_Splitter.wc(filename)
            total_count = total_count + line_count
            total_size = total_size + size
            print("%i\t%i\t%s" % (line_count, size, filename))
        if len(args.filenames) > 1:
            print("%i\t%i\ttotal" % (total_count, total_size))
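# Example call, reusing test data filenames from the tests above:
#
#   pwc(["data/threelines.txt", "data/fourlines.txt"])
#
# prints a lines/bytes/filename row per file, plus a total row because more
# than one file was given.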
def split_file_main(*argv):
    usage_message = '''
    Split a text file into separate pieces. If you specify --autosplit then the
    program uses the first ten lines to calculate an average line size and from
    that determines the approximate number of splits. If you use --splitsize
    then the file is split into --splitsize chunks until it is consumed.
    '''

    parser = argparse.ArgumentParser(usage=usage_message)
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __VERSION__)
    parser.add_argument("--autosplit", type=int,
                        help="split file based on looking at the first ten lines and overall file size [default: %(default)s]")
    parser.add_argument('--hasheader', default=False, action="store_true",
                        help="Ignore header when calculating splits, don't include header in output")
    parser.add_argument('--usefieldfile', type=str,
                        help="Use this field file and copy to match split filenames")
    parser.add_argument('--generatefieldfile', default=False, action="store_true",
                        help="Generate a fieldfile for each input file")
    parser.add_argument('--delimiter', default=",",
                        help="Delimiter for fields [default: %(default)s]")
    parser.add_argument("--splitsize", type=int,
                        help="Split file into chunks of this size")
    parser.add_argument('--verbose', default=False, action="store_true",
                        help="Print out what is happening")
    parser.add_argument("filenames", nargs="*", help='list of files')
    args = parser.parse_args(*argv)

    if len(args.filenames) == 0:
        print("No input file specified to split")
        sys.exit(0)

    files = []
    expected_size = 0  # sum of the no-header sizes of every file actually split
    for i in args.filenames:
        if not os.path.isfile(i):
            print("No such input file: '{}'".format(i))
            continue
        splitter = File_Splitter(i, args.hasheader)
        if args.autosplit:
            if args.verbose:
                print("Autosplitting: '{}' into approximately {} parts".format(i, args.autosplit))
            for newfile in splitter.autosplit(args.autosplit):
                files.append(newfile)
        else:
            if args.verbose:
                print("Splitting '%s' using %i splitsize" % (i, args.splitsize))
            for newfile in splitter.splitfile(args.splitsize):
                files.append(newfile)
        expected_size = expected_size + splitter.no_header_size()

    count = 1
    total_size = 0
    total_lines = 0
    results = list(files)
    for (part_name, lines) in results:
        size = os.path.getsize(part_name)
        total_size = total_size + size
        total_lines = total_lines + lines
        if args.verbose:
            print("{:4}. '{:20}'. Lines : {:6}, Size: {:10}".format(count, part_name, lines, size))
        count = count + 1

    if len(files) > 1 and args.verbose:
        print("{} {:16} {:17}".format(" " * (len(part_name) + 7), total_lines, total_size))

    if files and (total_size != expected_size):
        raise ValueError("Filesize of original and pieces does not match: total_size: %i, no header split_size: %i" %
                         (total_size, expected_size))

    return results
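# A hypothetical multi-file run (paths invented for illustration):
#
#   split_file --autosplit 4 --verbose data/a.csv data/b.csv
#
# Unlike split_file() above, which only splits the first named file,
# split_file_main() splits every input file it is given.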
def test_get_average_line_size(self):
    self.assertEqual(10, File_Splitter("data/tenlines.txt").get_average_line_size())
def multi_import(*argv):
    """
    .. function:: multi_import( *argv )

       Import CSV files using multiprocessing

       :param argv: list of command lines
    """

    usage_message = '''
    A master script to manage uploading of a single data file as multiple input files.

    Multi-import will optionally split a single file (specified by the --single
    argument) or optionally upload an already split list of files passed in on
    the command line. Each file is uploaded by a separate pymongoimport
    subprocess.
    '''

    parser = argparse.ArgumentParser(usage=usage_message)
    parser = add_standard_args(parser)
    parser.add_argument("--autosplit", type=int,
                        help="split file based on looking at the first ten lines and overall file size [default: %(default)s]")
    parser.add_argument("--splitsize", type=int,
                        help="Split file into chunks of this size [default: %(default)s]")

    args = parser.parse_args(*argv)

    log = Logger("multiimport").log()
    Logger.add_file_handler("multiimport")
    Logger.add_stream_handler("multiimport")

    child_args = sys.argv[1:]
    children = OrderedDict()

    if len(args.filenames) == 0:
        log.info("no input file to split")
        sys.exit(0)

    if args.autosplit or args.splitsize:
        if len(args.filenames) > 1:
            log.warning("More than one input file specified ( '%s' ) only splitting the first file: '%s'",
                        " ".join(args.filenames), args.filenames[0])
        if args.autosplit:
            child_args = strip_arg(child_args, "--autosplit", True)
        if args.splitsize:
            child_args = strip_arg(child_args, "--splitsize", True)
        splitter = File_Splitter(args.filenames[0], args.hasheader)

    for i in args.filenames:  # remove the old filenames from the child arguments
        child_args = strip_arg(child_args, i, False)

    if args.autosplit:
        log.info("Autosplitting file: '%s' into (approx) %i chunks", args.filenames[0], args.autosplit)
        files = splitter.autosplit(args.autosplit)
    elif args.splitsize:
        log.info("Splitting file: '%s' into %i line chunks", args.filenames[0], args.splitsize)
        files = splitter.split_file(args.splitsize)
    else:
        files = []
        for i in args.filenames:
            files.append((i, os.path.getsize(i)))

    if args.restart:
        log.info("Ignoring --drop overridden by --restart")
    elif args.drop:
        client = pymongo.MongoClient(args.host)
        log.info("Dropping database : %s", args.database)
        client.drop_database(args.database)
        child_args = strip_arg(child_args, "--drop")

    start = time.time()

    process_count = 0
    try:
        for filename in files:
            process_count = process_count + 1
            proc_name = filename[0]
            new_args = copy.deepcopy(child_args)
            new_args.extend([filename[0]])
            # args to Process must be a tuple
            proc = Process(target=mongo_import, name=proc_name, args=(new_args,))
            proc.daemon = True
            log.info("Processing '%s'", filename[0])
            children[proc_name] = {"process": proc}
            log.info("starting sub process: %s", proc_name)
            children[proc_name]["start"] = time.time()
            proc.start()

        for i in children.keys():
            log.info("Waiting for process: '%s' to complete", i)
            children[i]["process"].join()
            children[i]["end"] = time.time()
            log.info("elapsed time for process %s : %f",
                     i, children[i]["end"] - children[i]["start"])
    except KeyboardInterrupt:
        for i in children.keys():
            log.info("terminating process: '%s'", i)
            children[i]["process"].terminate()

    finish = time.time()
    log.info("Total elapsed time: %f", finish - start)
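# A hypothetical invocation (file path invented; connection flags such as
# --host and --database come from add_standard_args, so their exact names are
# an assumption here):
#
#   multiimport --autosplit 4 data/big.csv
#
# This splits data/big.csv into roughly four pieces and starts one
# mongo_import subprocess per piece.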
def _auto_split_helper(self, filename, lines, split_count, has_header=False, dos_adjust=False):
    splitter = File_Splitter(filename, has_header=has_header)
    part_total_size = 0
    part_total_count = 0
    total_line_count = LineCounter(filename).line_count()
    self.assertEqual(total_line_count, lines)

    for (part_name, line_count) in splitter.autosplit(split_count):
        splitter_part = File_Splitter(part_name)
        part_count = LineCounter(part_name).line_count()
        self.assertGreater(part_count, 0)
        self.assertEqual(part_count, line_count)
        part_total_count = part_total_count + part_count
        part_total_size = part_total_size + splitter_part.size()
        os.unlink(part_name)

    lc = LineCounter(filename)
    if has_header:
        self.assertEqual(part_total_count, lines - 1)
        if splitter.file_type() is FileType.DOS:
            # one char per \r\n line ending plus the header line, which
            # includes its own \r\n (hence the +1)
            self.assertEqual(part_total_size,
                             splitter.size() - lc.line_count() - len(splitter.header_line()) + 1)
        else:
            self.assertEqual(part_total_size,
                             splitter.size() - len(splitter.header_line()))
    else:
        self.assertEqual(part_total_count, lines)
        if splitter.file_type() is FileType.DOS:
            self.assertEqual(part_total_size, splitter.size() - lc.line_count())
        else:
            self.assertEqual(part_total_size, splitter.size())