Example no. 1
def splitGeneExpressionCSV(GENE_NAME, nprocs):
    filename = getGeneFileName(GENE_NAME)
    # Round the chunk size up so the split yields at most nprocs parts.
    ave, res = divmod(countLinesCSV(filename), int(nprocs))
    if res:
        ave += 1
    print('Splitting file %s' % filename)
    with open(filename) as filehandler:
        csv_splitter.split(filehandler=filehandler,
                           output_name_template=GENE_NAME + '_part_%s.csv',
                           output_path=WORKING_DIR,
                           row_limit=ave)
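The keyword arguments above, and the positional calls in the later examples, line up with the widely circulated csv_splitter helper. Its split() signature is assumed throughout this page to be the following stub (an assumption worth checking against the copy vendored in each project; Example no. 7 is the odd one out, as it passes a file name rather than a file handle):

def split(filehandler, delimiter=',', row_limit=10000,
          output_name_template='output_%s.csv', output_path='.',
          keep_headers=True):
    # Assumed interface, not verified against each project's copy:
    # writes filehandler's rows into files of at most row_limit rows,
    # named output_name_template % part_number under output_path.
    ...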
Example no. 2
def split_table_files(self, split_file):
    # Start from a clean data_store directory.
    if os.path.isdir("../data_store/"):
        shutil.rmtree("../data_store/")
    os.mkdir("../data_store/")
    for table_def in table_data.table_data.values():
        print("Creating " + table_def["name"] + " data store files...")
        if not os.path.isdir("../data_store/" + table_def["name"]):
            os.mkdir("../data_store/" + table_def["name"])
        if split_file:
            with open("../data/" + table_def["data_loc"], "r") as f:
                csv_splitter.split(f, ",", 100,
                                   table_def["name"] + "_%s.csv",
                                   "../data_store/" + table_def["name"],
                                   False)
        else:
            shutil.copy2("../data/" + table_def["data_loc"],
                         "../data_store/" + table_def["name"])
        table_def["data_store"] = "../data_store/" + table_def["name"] + "/"
        table_def["data_files"] = os.listdir("../data_store/" +
                                             table_def["name"] + "/")
Example no. 3
import csv_splitter

# Split with only a file handle, relying on the splitter's defaults for
# delimiter, row limit, output naming and output path.
with open('/home/josh/python/SNLP/yelp_dataset_challenge_round9/yelp_academic_dataset_review.csv',
          'r') as f:
    csv_splitter.split(f)
Example no. 4
    #step 1 - setup (write the headers) and close the files
    readFile = open('../datafiles/SFPermitData/Building_Permits.csv', 'r')
    reader = csv.DictReader(readFile)
    newFilePath = '../datafiles/SFPermitData/Building_Permits_Extended.csv'
    if os.path.isfile(newFilePath):
        os.remove(newFilePath)  #prevents us from re-editing same file
    writeFile = open(newFilePath, 'w')
    headerWriter = csv.writer(writeFile, lineterminator='\n')
    headerWriter.writerow(columnHeaders)

    readFile.close()
    writeFile.close()

    #after initial setup, split the files
    with open('../datafiles/SFPermitData/Building_Permits.csv', 'r') as f:
        csv_splitter.split(f, output_path='../datafiles/SFPermitData/Split/')
    #get housing data for all the split files in parallel
    pool.map(get_housing_data,
             os.listdir('../datafiles/SFPermitData/Split/'))

    #now, write them back to the original
    writeFile = open(newFilePath, 'w')
    writer = csv.DictWriter(writeFile,
                            lineterminator='\n',
                            fieldnames=columnHeaders)
    #re-opening with 'w' truncated the header row written in step 1, so write it again
    writer.writeheader()
    for fileName in os.listdir('../datafiles/SFPermitData/SplitExtended/'):
        readFile = open('../datafiles/SFPermitData/SplitExtended/' + fileName)
        reader = csv.DictReader(readFile)
        for row in reader:
            toWrite = {}
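The listing is cut off inside the write-back loop; a minimal sketch of how that loop typically continues, assuming each extended row carries the keys in columnHeaders (the column filtering is illustrative, not recovered from the original):

            # Illustrative continuation: keep only the expected columns, then write.
            for header in columnHeaders:
                toWrite[header] = row.get(header, '')
            writer.writerow(toWrite)
        readFile.close()
    writeFile.close()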
Example no. 5
def main(path):
    # Split into 300000-row chunks written to ./output/ using the
    # youtube_%s.csv name template.
    with open(path, 'r') as f:
        csv_splitter.split(f, ',', 300000, 'youtube_%s.csv', './output/', True)
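Read against the signature assumed under Example no. 1, the positional call above maps to keywords as follows (again an assumption, shown only for readability):

csv_splitter.split(f,
                   delimiter=',',
                   row_limit=300000,
                   output_name_template='youtube_%s.csv',
                   output_path='./output/',
                   keep_headers=True)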
Example no. 6
def split(args):
    file_to_split = args.file
    with open(file_to_split, 'r') as f:
        csv_splitter.split(f)
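A minimal sketch of how this handler could be wired to a command line, assuming an argparse-based CLI (the parser setup is illustrative, not from the original project):

import argparse

parser = argparse.ArgumentParser(description='Split a large CSV file.')
parser.add_argument('file', help='path of the CSV file to split')
split(parser.parse_args())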
Example no. 7
    conn.cursor().execute('USE ROLE ACCOUNTADMIN')

    sql = "remove @DDB_STG01/customer pattern ='.*.csv.gz'"
    conn.cursor().execute(sql)
    conn.close()
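The REMOVE above clears previously staged files; the matching upload of the split files is a PUT, sketched here on the assumption that conn is an open snowflake.connector connection and that the stage and local folder names match the rest of this example:

# Illustrative sketch: upload the split CSV parts to the same stage.
put_sql = ("PUT 'file://C:/Users/north/OneDrive/Documents/Snowflake/"
           "SampleData/SplitFIleFdr/*.csv' @DDB_STG01/customer "
           "AUTO_COMPRESS=TRUE")
conn.cursor().execute(put_sql)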


# -- <) ===============================================================================
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# M A I N     F L O W
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
log_file_setup()
connection_parameters = args_to_properties(sys.argv)

# Split LARGE files
csv_splitter.split(connection_parameters['largefile'])

DATABASE = connection_parameters["database"]
SCHEMA = connection_parameters["schema"]
STAGE = connection_parameters["stage"]
FILEFORMAT = connection_parameters["fileformat"]
LARGEFILE = connection_parameters["largefile"]

#========================================================================================
# STEP 1 - Move split files to the Stage Location in Snowflake
# Define the list of variables which determine the data that will be loaded
#========================================================================================
split_files = 'C:/Users/north/OneDrive/Documents/Snowflake/SampleData/SplitFIleFdr/*.csv'
variablesList = [{
    'sourceLocation': split_files,
    'destinationTable': f'{STAGE}/customer/'