def splitGeneExpressionCSV(GENE_NAME, nprocs):
    filename = getGeneFileName(GENE_NAME)
    # ave = rows per chunk when dividing the file across nprocs workers; res is the remainder (unused here)
    ave, res = divmod(countLinesCSV(filename), int(nprocs))
    print('Splitting file %s' % filename)
    csv_splitter.split(filehandler=open(filename),
                       output_name_template=GENE_NAME + '_part_%s.csv',
                       output_path=WORKING_DIR,
                       row_limit=ave)
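The snippets in this section call csv_splitter.split with varying mixes of positional and keyword arguments (filehandler, delimiter, row_limit, output_name_template, output_path, keep_headers). The module itself is not shown, so the following is only a minimal sketch of a split helper consistent with those call sites; the parameter names, defaults, and chunk-numbering behaviour are assumptions inferred from the examples, not a documented API.

import csv
import os


def split(filehandler, delimiter=',', row_limit=10000,
          output_name_template='output_%s.csv', output_path='.',
          keep_headers=True):
    """Write the rows of an open CSV file into numbered chunk files of at most
    row_limit rows each, optionally repeating the header row in every chunk.
    (Sketch only; behaviour assumed from the call sites in this section.)"""
    reader = csv.reader(filehandler, delimiter=delimiter)
    headers = next(reader) if keep_headers else None
    piece, rows_in_piece, writer, out = 1, 0, None, None
    for row in reader:
        # Start a new chunk file when the current one is full (or on the first row)
        if writer is None or rows_in_piece >= row_limit:
            if out is not None:
                out.close()
            out = open(os.path.join(output_path, output_name_template % piece),
                       'w', newline='')
            writer = csv.writer(out, delimiter=delimiter)
            if headers is not None:
                writer.writerow(headers)
            piece += 1
            rows_in_piece = 0
        writer.writerow(row)
        rows_in_piece += 1
    if out is not None:
        out.close()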
def split_table_files(self, split_file):
    # Rebuild the data store directory from scratch
    if os.path.isdir("../data_store/"):
        shutil.rmtree("../data_store/")
    os.mkdir("../data_store/")
    for table_def in table_data.table_data.values():
        print("Creating " + table_def["name"] + " data store files...")
        if not os.path.isdir("../data_store/" + table_def["name"]):
            os.mkdir("../data_store/" + table_def["name"])
        if split_file:
            # Split the table's source CSV into numbered chunk files in its data_store directory
            csv_splitter.split(
                open("../data/" + table_def["data_loc"], "r"),
                ",", 100,
                table_def["name"] + "_%s.csv",
                "../data_store/" + table_def["name"],
                False)
        else:
            shutil.copy2("../data/" + table_def["data_loc"],
                         "../data_store/" + table_def["name"])
        table_def["data_store"] = "../data_store/" + table_def["name"] + "/"
        table_def["data_files"] = os.listdir("../data_store/" + table_def["name"] + "/")
import csv_splitter

csv_splitter.split(
    open('/home/josh/python/SNLP/yelp_dataset_challenge_round9/yelp_academic_dataset_review.csv', 'r'))
# step 1 - setup (write the headers) and close the files
readFile = open('../datafiles/SFPermitData/Building_Permits.csv', 'r')
reader = csv.DictReader(readFile)
newFilePath = '../datafiles/SFPermitData/Building_Permits_Extended.csv'
if os.path.isfile(newFilePath):
    os.remove(newFilePath)  # prevents us from re-editing the same file
writeFile = open(newFilePath, 'w')
headerWriter = csv.writer(writeFile, lineterminator='\n')
headerWriter.writerow(columnHeaders)
readFile.close()
writeFile.close()

# after initial setup, split the files
csv_splitter.split(open('../datafiles/SFPermitData/Building_Permits.csv', 'r'),
                   output_path='../datafiles/SFPermitData/Split/')

# get housing data for all the split files
pool.map(get_housing_data,
         [f for f in os.listdir('../datafiles/SFPermitData/Split/')])

# now, write them back to the original
writeFile = open(newFilePath, 'w')
writer = csv.DictWriter(writeFile, lineterminator='\n', fieldnames=columnHeaders)
for fileName in os.listdir('../datafiles/SFPermitData/SplitExtended/'):
    readFile = open('../datafiles/SFPermitData/SplitExtended/' + fileName)
    reader = csv.DictReader(readFile)
    for row in reader:
        toWrite = {}
def main(path):
    filePath = path
    csv_splitter.split(open(filePath, 'r'), ',', 300000, 'youtube_%s.csv', './output/', True)
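The original snippet does not show how main is invoked; a hypothetical entry point that takes the input path from the command line (an assumption, not part of the source) could be:

import sys

if __name__ == '__main__':
    # Path of the source CSV is assumed to be the first command-line argument
    main(sys.argv[1])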
def split(args):
    file_to_split = args.file
    with open(file_to_split, 'r') as f:
        csv_splitter.split(f)
    return split
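Since the handler reads args.file, it was presumably registered with an argument parser. A hypothetical argparse wiring consistent with that attribute (the subcommand name and argument are assumptions, not shown in the source) might look like:

import argparse

parser = argparse.ArgumentParser(description='CSV utilities')
subparsers = parser.add_subparsers()
split_cmd = subparsers.add_parser('split', help='split a CSV file into chunks')
split_cmd.add_argument('file', help='path of the CSV file to split')
split_cmd.set_defaults(func=split)

args = parser.parse_args()
args.func(args)  # dispatches to split(args) for the 'split' subcommand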
conn.cursor().execute('USE ROLE ACCOUNTADMIN')
sql = "remove @DDB_STG01/customer pattern ='.*.csv.gz'"
conn.cursor().execute(sql)
conn.close()

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# M A I N   F L O W
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
log_file_setup()
connection_parameters = args_to_properties(sys.argv)

# Split LARGE files
csv_splitter.split(connection_parameters['largefile'])

DATABASE = connection_parameters["database"]
SCHEMA = connection_parameters["schema"]
STAGE = connection_parameters["stage"]
FILEFORMAT = connection_parameters["fileformat"]
LARGEFILE = connection_parameters["largefile"]

# ========================================================================================
# STEP 1 - Move split files to the Stage Location in Snowflake
# Define the list of variables which determine the data that will be loaded
# ========================================================================================
splittedFIles = csv_file = 'C:/Users/north/OneDrive/Documents/Snowflake/SampleData/SplitFIleFdr/*.csv'
variablesList = [{
    'sourceLocation': f'{splittedFIles}',
    'destinationTable': f'{STAGE}/customer/'