Example #1
 def _split_file(self, path):
     pth = path
     pth_dirname = pth.parent
     outdir = pth_dirname.joinpath(pth.stem)
     outdir.mkdir(parents=True, exist_ok=True)
     fs = FileSplit(file=pth, splitsize=1_100_000, output_dir=outdir)
     fs.split()
Example #2
def main(filename):
    pth = Path(filename).resolve()
    assert pth.exists(), pth
    pth_dirname = pth.parent
    outdir = pth_dirname.joinpath(pth.stem)
    outdir.mkdir(parents=True, exist_ok=True)
    fs = FileSplit(file=pth, splitsize=1_100_000, output_dir=outdir)
    fs.split()
Example #3
def sharding(file_name):
    # create the output directory if it does not already exist
    os.makedirs('sample_sharded', exist_ok=True)
    # split the file into roughly four equal shards
    filesize = os.path.getsize(file_name)
    fs = FileSplit(file=file_name,
                   splitsize=filesize / 4,
                   output_dir="sample_sharded/")
    fs.split()
Example #4
def write_func(outputs, filenames):
    # make sure every destination directory exists
    for filename in filenames:
        fndir = filename.parent
        fndir.mkdir(parents=True, exist_ok=True)
    # write each output buffer to its file
    for output, filename in zip(outputs, filenames):
        with open(filename, 'w') as f:
            f.write(output.getvalue())
    # split each written file into ~1.1 MB chunks alongside the original
    for filename in filenames:
        outdir = filename.parent
        fs = FileSplit(file=filename, splitsize=1_100_000, output_dir=outdir)
        fs.split()
Example #5
def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # locate files in directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]
    tmp_dir = "{}/tmp".format(input_dir)

    # setup temp dir
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split files into chunk_size-MB chunks
    for f in files:
        fs = FileSplit(file=f,
                       splitsize=chunk_size * 1e6,
                       output_dir=tmp_dir)
        fs.split(callback=split_callback)
    split_files = os.listdir(tmp_dir)  # every chunk produced above

    # upload to hdfs
    hdfs_client = InsecureClient("http://{}:9870".format(
        settings.HDFS_HOST_VALUE),
                                 user=settings.HDFS_USER_VALUE)

    # delete existing output dir
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload files to tmp dir
    remote_path = hdfs_client.upload(hdfs_path="/tmp",
                                     local_path=tmp_dir,
                                     n_threads=-1,
                                     overwrite=True)

    # rename to output_dir
    hdfs_client.rename("/tmp", output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}'  ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))
    # collect the final chunk paths on hdfs for the caller
    hdfs_file_paths = [
        "{}/{}".format(output_dir, name) for name in hdfs_client.list(output_dir)
    ]

    # delete temp files
    shutil.rmtree(tmp_dir)

    return hdfs_file_paths
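The split_callback passed to fs.split() above is not defined in this example. A minimal sketch of what it might look like, assuming the same three-argument callback signature (path, size, line count) that the dispatchFile example further down uses; the body is illustrative only:

def split_callback(file_path, file_size, line_count):
    # Illustrative only: log each chunk as FileSplit produces it.
    # The parameter names follow the callback in the dispatchFile example.
    print("created chunk {} ({} bytes, {} lines)".format(
        file_path, file_size, line_count))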
Example #6
def split_file(file_to_split, number_of_chunks):
    file_size = os.path.getsize(file_to_split)
    split_size = file_size/(number_of_chunks-1)
    splitted_file_path = os.path.join(settings.MEDIA_ROOT, "splitted_file")
    # os.makedirs(splitted_file_path)
    fs = FileSplit(file_to_split, split_size, splitted_file_path)

    try:
        fs.split()
        os.remove(file_to_split)
    except Exception as e:
        print(e)
        return False
    else:
        return splitted_file_path
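splitsize here is a byte count, so FileSplit emits roughly file_size / splitsize files, rounded up. A hedged sketch of choosing a splitsize for a target chunk count with plain ceiling division (the helper name is hypothetical, not part of the example above):

import math
import os

def chunk_size_for(path, number_of_chunks):
    # Hypothetical helper: pick a splitsize so that splitting the file at
    # `path` yields about `number_of_chunks` output files.
    file_size = os.path.getsize(path)
    return math.ceil(file_size / number_of_chunks)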
Example #7
def main():
    dev_limits = config_get_dev_limits()
    token = api_get_token()

    pair_index = 0
    for dev_upload_path in config_get_dev_paths()["upload_pairs"]:
        file_id = api_get_file_id(token, None, pair_index)
        upload_url = api_create_upload_session(token, pair_index)
        total_size = fs_get_upload_size(pair_index)

        if total_size > dev_limits["upload_partition_limit"]:
            limit = dev_limits["upload_partition_limit"]
            print_message(
                "Greater than " + str(limit / 1024 / 1024) +
                " MB - need to split into chunks", "UPLOAD", "verbose")

            with tempfile.TemporaryDirectory(
                    dir=fs_get_parent_dir()) as tmpdirname:
                print_message("Created temporary directory: " + tmpdirname,
                              "UPLOAD", "verbose")
                fs = FileSplit(file=dev_upload_path,
                               splitsize=limit,
                               output_dir=tmpdirname)
                fs.split()
                chunks = get_chunks(tmpdirname, pair_index)
                start_byte = 0
                for chunk_name in chunks:
                    chunk_path = fs_get_chunk_full_path(tmpdirname, chunk_name)
                    chunk_size = fs_get_chunk_size(chunk_path)
                    if chunk_size > limit:
                        raise RuntimeError(
                            "There was a problem partitioning the tar file")
                    with open(chunk_path, 'rb') as chunk:
                        payload = chunk.read()
                    api_upload_chunk(upload_url, start_byte,
                                     start_byte + chunk_size - 1, total_size,
                                     payload)
                    start_byte += chunk_size
        else:
            print_message("Uploading entire file in one chunk", "UPLOAD",
                          "verbose")
            with open(dev_upload_path, 'rb') as file:
                payload = file.read()
            api_upload_chunk(upload_url, 0, total_size - 1, total_size,
                             payload)

        maintain_size(token, file_id, pair_index)
        pair_index += 1
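api_upload_chunk is not shown in this example. A minimal sketch of what such a byte-range upload could look like, assuming an HTTP upload session that accepts PUT requests with a Content-Range header; the requests call and header layout are assumptions, not the example's actual implementation:

import requests

def api_upload_chunk(upload_url, start_byte, end_byte, total_size, payload):
    # Hypothetical sketch: one PUT per chunk, describing the byte window
    # start_byte..end_byte out of total_size via a Content-Range header.
    headers = {
        "Content-Length": str(len(payload)),
        "Content-Range": "bytes {}-{}/{}".format(start_byte, end_byte, total_size),
    }
    response = requests.put(upload_url, headers=headers, data=payload)
    response.raise_for_status()
    return response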
Example #8
def dispatchFile(fileName: str, nodeList: list):

    fs = FileSplit(
        file=fileName,
        splitsize=math.ceil(os.path.getsize(fileName) / len(nodeList)) + 10,
        output_dir='.')

    def func(filePath, size, count):
        print("Dispatching file: {0}, size: {1}, count: {2}".format(
            filePath, size, count))
        num = int(filePath.split('_')[-1].split('.')[0]) - 1

        send_file(nodeList[num], filePath, filePath.split('/')[-1])
        os.remove(filePath)

    fs.split(callback=func)
Example #9
def split_file(root_path, split_size_mb, output_path):

    # Convert MB to bytes (1 MB = 1048576 bytes)
    split_size_mb = split_size_mb * 1048576

    # Collect files from root path
    file_path = collect_files(root_path)

    for file in file_path:

        # Create Output Directory
        export_path = create_dir(file, output_path)

        # Reads File
        fs = FileSplit(file=file,
                       splitsize=split_size_mb,
                       output_dir=export_path)

        # Splits file
        fs.split(include_header=True, callback=split_file_stats)
Example #10
class DMFCSV:
    def __init__(self, database="", table_name="", file_data=None):
        self.__file_name = "_".join([database, table_name])
        self.__data = file_data if file_data is not None else []

    def write_csv(self):
        with open(".".join([self.__file_name, FILE_EXT]), 'w') as csvfile:
            self.__data_writer = csv.writer(csvfile)
            self.__data_writer.writerows(self.__data)
        return True

    def split_csv(self):
        self.__fs = FileSplit(file=".".join(
            [self.__file_name, FILE_EXT]), splitsize=SPLIT_SIZE)
        return self.__fs.split(include_header=True)
Example #11
def split(file):  # File split function
    storage = FileSystemStorage()
    filename = storage.save(file.name, file)
    source = os.path.join(MEDIA_ROOT, filename)
    dest = os.path.join(MEDIA_ROOT, 'temp/')
    # aim for three chunks; the size-based split may leave a small fourth chunk
    chunk_size = int(file.size / 3)
    fs = FileSplit(source, chunk_size, dest)
    fs.split()
    os.remove(source)
    fname = filename.split('.')
    dest1 = dest + fname[0] + '_3.' + fname[1]
    dest2 = dest + fname[0] + '_4.' + fname[1]
    # if a fourth chunk exists, append it to the third and remove it
    if os.path.exists(dest1) and os.path.exists(dest2):
        with open(dest1, 'ab') as f:
            fl = open(dest2, 'rb').read()
            f.write(b'\n' + fl)
        os.remove(dest2)
Example #12
def split_csv(file_name='', file_ext='csv'):
    fs = FileSplit(file=".".join([file_name, file_ext]), splitsize=10)
    # fs = FileSplit(file=".".join([file_name, file_ext]), splitsize=1000000000)
    return fs.split(include_header=True)
Example #13
from fsplit.filesplit import FileSplit

fs = FileSplit(file='path/to/file', splitsize=500000000, output_dir='/path/to/output directory/')

https://pypi.org/project/filesplit/
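A slightly fuller sketch of the same call, creating the output directory first as most of the examples in this collection do; both paths are placeholders:

import os
from fsplit.filesplit import FileSplit

source = "path/to/file"                  # placeholder input path
output_dir = "path/to/output_directory"  # placeholder output directory

os.makedirs(output_dir, exist_ok=True)   # create the target directory up front
fs = FileSplit(file=source, splitsize=500_000_000, output_dir=output_dir)
fs.split()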
Example #14
def app_main(config):
    """
    Main loop of the program
    1. Split the input file into chunks
    2. Create a pool of worker processes and distribute the chunks between the processes
    3. Write the combined dataframe to a file
    """
    logger.info("in app_main")

    # initialize some variables that we need
    dns_data = None

    logger.info("configured action is {}".format(config.action))

    run_dir = os.path.join(g.DATA_DIR, g.DATASET_VER, g.RUN_NAME)
    os.makedirs(run_dir, exist_ok=True)

    if config.action == g.ACTION_WRANGLE_ONLY or config.action == g.ACTION_RUN_ALL:
        logger.info("running data wrangling operations...")

        # Step 0: read data needed by all worker threads
        file_path = os.path.join(g.RAW_DATA_DIR, g.DATASET_VER, g.CCTLD_FILE_NAME)
        ccTLD = pd.read_csv(file_path)
        logger.info("read country code top level domains...")
        logger.info(ccTLD.head())

        # step 1, split the file into chunks
        # we expect the file to be within the current project, soon this will be replaced
        # with download from a dataset repo if not present so the file is downloaded one time
        # and then available for use as long as the version number does not change (DATASET_VER)
        src_dir = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(src_dir, g.RAW_DATA_DIR, g.DATASET_VER, config.filename)
        output_dir = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER, g.SPLITS_DIR)

        p = Path(file_path)
        if p.exists() is False:
            logger.error("%s does not exist, existing %s", file_path, g.APP_NAME)
            sys.exit()
        else:
            logger.info("%s found, going to split it into %sMB sized chunks now", file_path, config.splitsizemb)
            os.makedirs(output_dir, exist_ok=True)
            split_size = g.MB*int(config.splitsizemb)

            # the split will create a malformed last line because it is size based and not line based,
            # but that is ok: we have millions of lines, so a few do not matter
            fs = FileSplit(file_path, split_size, output_dir)
            fs.split()

        logger.info("file split complete..moving to wrangling stage now")

        file_list = [os.path.join(output_dir, f) for f in os.listdir(output_dir)
                     if os.path.isfile(os.path.join(output_dir, f))]
        logger.info("file names for the splits are {}".format(file_list))

        # step 2: wrangle data
        dns_data, _, fqdn_stats, _, hourly_per_query_counts, _, _, _, _, _, hourly_unique_src_ip_counts = wrangle_data(file_list, ccTLD)
        logger.info("done wrangling data...")

        # step 3: get tfidf values for each domain so we can identify stop domains like stop words
        logger.info("going to do tfidf on domain names to figure out stop domains now..")
        fqdn_tfidf, fqdn_tfidf_subset = do_tfidf_transform_on_query_names()

        # step 3.1, create file with per hour query count most non-informative domains (from our perspective of
        # figuring out similarities between domain names)
        low_tfidf_domains = fqdn_tfidf_subset['fqdn']
        hourly_per_query_counts_low_tfidf = hourly_per_query_counts[hourly_per_query_counts['query'].isin(low_tfidf_domains)]
        file_path = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER,
                                 g.HOURLY_PER_QUERY_COUNT_LOW_TFIDF_TIMESERIES_FILE_NAME)
        hourly_per_query_counts_low_tfidf.to_csv(file_path, index=False)

        # step 3.2, create file with per hour unique src ip counts most non-informative domains
        # (from our perspective of figuring out similarities between domain names)
        hourly_unique_src_ip_counts_low_tfidf = hourly_unique_src_ip_counts[hourly_unique_src_ip_counts['query'].isin(low_tfidf_domains)]
        file_path = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER,
                                 g.HOURLY_UNIQUE_SRC_IP_COUNT_PER_LOW_TFIDF_QUERY)
        hourly_unique_src_ip_counts_low_tfidf.to_csv(file_path, index=False)

        # step 3.3, create file with per hour unique src ip counts for top 100 most frequently accessed domains
        hourly_unique_src_ip_counts_for_most_freq = hourly_unique_src_ip_counts[hourly_unique_src_ip_counts['query'].isin(fqdn_stats.tail(g.N_FOR_FQDNS_W_LOWEST_TFIDF)['query'])]
        file_path = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER,
                                 g.HOURLY_UNIQUE_SRC_IP_COUNT_MOST_FREQ_FQDNS)
        hourly_unique_src_ip_counts_for_most_freq.to_csv(file_path, index=False)

        # going to do some post processing on dns data
        # 1. remove the low tfidf domains i.e. treat them like stopwords in a sentence
        # 2. shorten the domain names to only keep the most significant part of it
        # 3. write to dnsdata file
        logger.info("going to post process dns data from previous step to remove stop domain names")
        dns_data = post_process_dns_data(dns_data, low_tfidf_domains, ccTLD)
        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.WRANGLED_DATASET_FILE_NAME)
        dns_data.to_csv(file_path, index=False)
        logger.info("wrote the final dns dataset to {}".format(file_path))

    if config.action == g.ACTION_ANALYZE_ONLY or config.action == g.ACTION_RUN_ALL:
        logger.info("running analysis...")
        if dns_data is None:
            # read the dns dataset created during the wrangling phase, either in this run
            # or a previous one
            file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.WRANGLED_DATASET_FILE_NAME)
            dns_data = pd.read_csv(file_path)

        # we have the data, either from the previous step or reading from the file
        logger.info(dns_data['query'].head())

        # read each line from the dataframe, split it to convert it into an array
        #  because word2vec needs an array of arrays (sentences)..
        # but before we do that we also need to typecast each
        # individual domain name in the "sentence" to a string because some domain names like
        # 169.254 make Python think this is a float..
        # queries_as_sentences = [[q for q in str(queries).split() if q.startswith("www")] for queries in dns_data['query']]
        queries_as_sentences = [str(queries).split() for queries in dns_data['query']]

        # write this list of lists to a file, we would use this as input to an LSTM model
        # to predict what domain name comes after a sequence of domains
        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.FINAL_INPUT_TO_W2V_MODEL_FILE_NAME) 
        with open(file_path,"w") as f:
            wr = csv.writer(f)
            wr.writerows(queries_as_sentences)

        # logger.info([len(q) for q in queries_as_sentences])

        # run word2vec
        # log all the word2vec config to a file for record purposes
        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.RUN_NAME, g.W2V_PARAMS_FILE_NAME)
        with open(file_path, "w") as w2v_params_file:
            w2v_parms = { 'run_name': g.RUN_NAME,
                          'num_sentences': len(queries_as_sentences),
                          'embedding_size': g.W2V_EMBEDDING_SIZE,
                          'window_size': g.W2V_WINDOW,
                          'min_count': g.W2V_MIN_COUNT,
                          'negative': g.W2V_NEGATIVE_SAMPLING,
                          'max_vocab_size': g.W2V_MAX_VOCAB_SIZE,
                          'sample': g.W2V_SAMPLE,
                          'ns_exponent': g.W2V_NS_EXPONENT,
                          'num_workers': g.W2V_WORKERS,
                          'sg': g.W2V_USE_SG,
                          'epochs': g.WV_EPOCHS,
                          'seed': g.WV_SEED }
            w2v_params_file.write(json.dumps(w2v_parms, indent=2))
        model_dns = Word2Vec(sentences=queries_as_sentences, size=g.W2V_EMBEDDING_SIZE, window=g.W2V_WINDOW,
                             min_count=g.W2V_MIN_COUNT, workers=g.W2V_WORKERS, sg=g.W2V_USE_SG, iter=g.WV_EPOCHS,
                             negative = g.W2V_NEGATIVE_SAMPLING, max_vocab_size=g.W2V_MAX_VOCAB_SIZE,
                             sample = g.W2V_SAMPLE, ns_exponent = g.W2V_NS_EXPONENT, 
                             seed=g.WV_SEED, compute_loss=True, callbacks=[EpochLogger()])


        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.RUN_NAME,
                                 g.DNS_DATASET_FILE_NAME.split('.')[0] + g.MODEL_FILE_SUFFIX)
        logger.info("saving Dns2Vc model to {}".format(file_path))
        model_dns.save(file_path)

        logger.info("going to save dns vectors...")
        save_dns_vectors_for_tensorboard(model_dns)

        logger.info("going to run tests to predict similar domains...")
        run_dns2vec_tests(model_dns)

    logger.info("nothing more to do..")
Example #15
def run_import_file(
    account,
    user,
    password,
    database,
    warehouse,
    schema,
    schema_table,
    file_path,
    delimiter=",",
):
    row_count = 0

    db = Database(
        account,
        user,
        password,
        database,
        warehouse,
        schema,
    )
    try:
        # Uppercase table
        schema_table = schema_table.upper()

        # File dir and name
        path_parts = os.path.split(file_path)
        file_dir = path_parts[0]
        file_name = path_parts[1]

        # Tablename underscored
        table_name_text = file_name.replace('.', '_')

        # Fileformat
        try:
            table_format = table_name_text + "_format"
            create_format_sql = (
                "create or replace file format " + table_format +
                " type = 'CSV' field_delimiter = '" + delimiter
                #+ "' skip_header = 1; "
                + "'; ")
            logger.debug(create_format_sql)
            db.execute(create_format_sql)
        except Exception:
            logger.error('Failed to create file format: ' + file_path)

        # Stage
        try:
            table_stage = table_name_text + "_stage"
            create_stage_sql = ("create or replace stage " + table_stage +
                                " file_format = " + table_format + ";")
            logger.debug(create_stage_sql)
            db.execute(create_stage_sql)
        except Exception:
            logger.error('Failed to create stage: ' + file_path)

        # Split files

        fs = FileSplit(file=file_path, splitsize=50000000, output_dir=file_dir)
        fs.split()

        os.remove(file_path)

        # put
        try:
            files = file_path[:-4] + "*.csv"
            put_sql = ("PUT file://" + files + " @" + table_stage +
                       " auto_compress=true;")
            logger.debug(put_sql)
            db.execute(put_sql)
        except Exception:
            logger.error('Failed PUT: ' + file_path)

        # copy
        try:
            copy_sql = ("COPY INTO " + schema_table + " FROM @" + table_stage +
                        " file_format = (format_name = " + table_format +
                        ") on_error = 'CONTINUE';")
            logger.debug(copy_sql)
            sfqid = db.execute(copy_sql).sfqid
            logger.debug("copy table success")

            logger.debug('Snowflake copy query id: ' + sfqid)
            sfqid = "'" + sfqid + "'"
        except Exception:
            logger.error('Failed COPY: ' + file_path)

        try:
            qstring = 'SELECT * FROM TABLE(RESULT_SCAN({}))'
            load_result = db.execute(qstring.format(sfqid)).fetchall()

            for res in load_result:
                logger.debug(res)

            row_count = sum([row[3] for row in load_result])

            if len([row[1]
                    for row in load_result if row[1] == 'LOAD_FAILED']) > 0:
                logger.error('Load completed with errors')
        except Exception:
            logger.error('Failed getting load results: ' + file_path)

        # remove stage
        drop_stage_sql = "DROP STAGE IF EXISTS " + table_stage
        logger.debug(drop_stage_sql)
        db.execute(drop_stage_sql)
        logger.debug("stage deleted")

        # remove fileformat
        drop_file_format_sql = "DROP FILE FORMAT IF EXISTS " + table_format
        logger.debug(drop_file_format_sql)
        db.execute(drop_file_format_sql)
        logger.debug("format deleted")

        logger.debug(str(row_count) + " records imported")

        return row_count

    except Exception:
        logger.error("Failed importing file: " + file_path)
        return row_count
Example #16
from fsplit.filesplit import FileSplit

fs = FileSplit(file='./wiki.json', splitsize=10240000, output_dir='../data')
fs.split()
Example #17
def main():
    print("Wikipedia Wordlist Extractor")
    xmlfile_path = args.n
    pros = int(args.p)
    listname = args.f[:-4] + "-wordlist.txt"

    #################################################
    # Creating directory for storing splits if such a
    # directory does not already exist
    #################################################
    path = os.path.join(os.getcwd(), "wikisplits")

    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Wikisplits directory created")

    #################################################################
    # Splits up the file to facilitate data manipulation (size in MB)
    #################################################################
    print("Splitting file", listname, "...")
    fs = FileSplit(file=xmlfile_path, splitsize=5000000, output_dir=path)
    fs.split()

    print("File", xmlfile_path, "was split in directory", path)

    ###################################################
    # Proceeds to strip the files of characters, delete
    # smaller words, and clean up empty lines
    ###################################################

    # Removes all non-Alphanumeric Characters
    print("\nRemoving non-alphanumeric characters & words smaller than 4...")

    os.chdir(path)
    directory = os.getcwd()
    dirlist = sorted(os.listdir(directory))

    pool = Pool(processes=pros)
    for _ in tqdm(pool.imap(removal, dirlist), total=len(dirlist)):
        pass

    print(
        "\nNon-Alphanumeric characters and words of size smaller than 4 removed."
    )

    ########################################################
    # Merges the lists back together, eliminating duplicates
    # and then deletes the 'wikisplit' directory.
    ########################################################
    print("\nMerging lists back")
    worddict = {}
    for file in sorted(os.listdir(directory)):
        with open(file, 'r+', encoding="UTF-8") as f:
            for line in tqdm(f):
                if (len(line.strip()) == 0):
                    continue
                worddict[line.strip()] = 1

    fileresult = open(listname, 'w', encoding="UTF-8")
    for word in tqdm(sorted(list(worddict.keys()))):
        fileresult.write(word + '\n')
    fileresult.close()

    os.chdir('..')
    shutil.move(os.path.join(path, listname),
                os.path.join(os.getcwd(), "wikilists"))

    print(
        listname +
        " has been created. It contains all the words from the wikipedia dump minus duplicates, and has also been sorted in ascending order."
    )

    try:
        shutil.rmtree(path)
    except Exception as e:
        print('%s ' % e)
    else:
        print('\n%s was deleted' % path)
Example #18
'''
Prerequisite:
https://pypi.org/project/filesplit/

PS: Applicable to a .bat file as well :)
'''

from fsplit.filesplit import FileSplit
fs = FileSplit(file=r"C:\Users\DHAVAL\Desktop\Old_resumes\really_big_file.txt",
               splitsize=500,
               output_dir=r"C:\Users\DHAVAL\Desktop\Old_resumes")
fs.split()
Example #19
# pw = getpass()
# params['database']['password'] = pw
# pop_util = PopulationUtil(params['database'])

handler = MagController(params['mag_indexes'])
with open(params['mapping_path'], 'r') as handle:
    apn_maz_mapping = json.load(handle)

sliced = False

if sliced:
    st = os.stat(params['mag_path'])
    split_size = 300000000
    mag_dir = '/'.join(params['mag_path'].split('/')[0:-1])

    fs = FileSplit(params['mag_path'], split_size, output_dir=mag_dir)
    fs.split(include_header=True)

    # one pass per chunk: FileSplit writes ceil(size / split_size) files
    num_chunks = -(-st.st_size // split_size)
    for i in range(1, num_chunks + 1):
        params[
            'mag_path'] = f"../../data/raw_travel_data/output_disaggTripList_{i}.csv"

        plans = handler.plans_by_col(params['mag_path'],
                                     params['mag_column_ids'])
        population = handler.plans_to_population(plans)

        # with open(f'../../data/processed_MAG_data/population_{i}.dill', 'wb+') as handle:
        #     dill.dump(population, handle)

        # with open('../../data/processed_MAG_data/population.dill', 'rb') as handle:
        #     population: MagPopulation = dill.load(handle)
Example #20
 def split_csv(self):
     self.__fs = FileSplit(file=".".join(
         [self.__file_name, FILE_EXT]), splitsize=SPLIT_SIZE)
     return self.__fs.split(include_header=True)
Example #21
def splitFile(fileName):
    fs = FileSplit(file=CONST_DOWN + fileName,
                   splitsize=CONST_BLOCK_SIZE * CONST_BYTES_PER_MB,
                   output_dir=(CONST_DOWN))
    fs.split()
    os.remove(CONST_DOWN + fileName)
Example #22
from fsplit.filesplit import FileSplit
from pathlib import Path
import myconf

split_folder = myconf.split_folder

Path(split_folder).mkdir(parents=True, exist_ok=True)
FileSplit(file=myconf.file_path,
          splitsize=myconf.number_of_lines_in_splitted_file,
          output_dir=split_folder).split()
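myconf is a local settings module that is not shown. A hypothetical sketch of what it might contain; the attribute names come from the snippet above, while the values are placeholders (note that the other examples in this collection pass splitsize as a byte count, despite this attribute's name):

# myconf.py -- hypothetical contents, placeholder values
split_folder = "splits"                       # directory that receives the chunks
file_path = "data/big_input.csv"              # file to split
number_of_lines_in_splitted_file = 1_000_000  # passed straight to FileSplit's splitsize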
Example #23
#!/usr/bin/env python
# coding: utf-8

# In[1]:


pip install filesplit


# In[39]:


from fsplit.filesplit import FileSplit

f = FileSplit(file='C:/Users/Niteesha/Desktop/mendley/dataset.csv', splitsize=1848000)


# In[40]:


f.split()


# In[41]:


import hashlib


Example #24
def split_file():
    fs = FileSplit(file='C:\\Users\\Lakshman\\Downloads\\site_hits.tsv',
                   splitsize=1684670,
                   output_dir='C:\\Users\\Lakshman\\Downloads\\splitfiletest')
    # include_header=True repeats the source file's header row in every chunk
    fs.split(include_header=True)