def _split_file(self, path):
    pth = path
    pth_dirname = pth.parent
    outdir = pth_dirname.joinpath(pth.stem)
    outdir.mkdir(parents=True, exist_ok=True)
    fs = FileSplit(file=pth, splitsize=1_100_000, output_dir=outdir)
    fs.split()
def main(filename):
    pth = Path(filename).resolve()
    assert pth.exists(), pth
    pth_dirname = pth.parent
    # Output directory named after the file, created alongside it
    outdir = pth_dirname.joinpath(pth.stem)
    outdir_relative = outdir.relative_to(pth_dirname)
    outdir_relative.mkdir(parents=True, exist_ok=True)
    fs = FileSplit(file=pth, splitsize=1_100_000, output_dir=outdir_relative)
    fs.split()
def sharding(file_name):
    try:
        os.makedirs('sample_sharded')
    except OSError:
        # directory already exists
        pass
    filesize = os.path.getsize(file_name)
    # Split the file into four roughly equal shards
    fs = FileSplit(file=file_name, splitsize=filesize / 4, output_dir="sample_sharded/")
    fs.split()
def write_func(self, outputs, filenames):
    for filename in filenames:
        fndir = filename.parent
        fndir.mkdir(parents=True, exist_ok=True)
    for output, filename in zip(outputs, filenames):
        with open(filename, 'w') as f:
            f.write(output.getvalue())
    for filename in filenames:
        dirname = filename.parent
        new_pth_cmpts = dirname.relative_to(self.basedir)
        outdir = self.basedir.joinpath(new_pth_cmpts)
        fs = FileSplit(file=filename, splitsize=1_100_000, output_dir=outdir)
        fs.split()
def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # locate files in directory
    files = [
        os.path.abspath("{}/{}".format(input_dir, f)) for f in listdir(input_dir)
        if isfile(join(input_dir, f))
    ]

    # set up temp dir
    tmp_dir = "{}/tmp".format(input_dir)
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # split files into chunks of chunk_size MB
    for f in files:
        fs = FileSplit(file=f, splitsize=chunk_size * 1e6, output_dir=tmp_dir)
        fs.split(callback=split_callback)

    # upload to hdfs
    hdfs_client = InsecureClient("http://{}:9870".format(settings.HDFS_HOST_VALUE),
                                 user=settings.HDFS_USER_VALUE)

    # delete existing output dir
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # upload files to tmp dir
    remote_path = hdfs_client.upload(hdfs_path="/tmp",
                                     local_path=tmp_dir,
                                     n_threads=-1,
                                     overwrite=True)

    # rename to output_dir
    hdfs_client.rename("/tmp", output_dir)
    print("{} files uploaded to hdfs host '{}{}' ({} file chunks total)".format(
        len(files),
        settings.HDFS_HOST_VALUE,
        output_dir,
        len(split_files),
    ))

    # delete temp files
    shutil.rmtree(tmp_dir)

    return hdfs_file_paths
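# Hypothetical sketch (not in the original): `split_callback` and `split_files` are
# referenced above but not shown. Assuming the callback signature used in the
# dispatchFile example further down, i.e. (path, size, count), the callback could
# simply record each chunk FileSplit writes so the summary message can report them.
split_files = []

def split_callback(path, size, count):
    # Track every chunk produced by FileSplit
    split_files.append(path)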
def split_file(file_to_split, number_of_chunks):
    file_size = os.path.getsize(file_to_split)
    split_size = file_size / (number_of_chunks - 1)
    splitted_file_path = os.path.join(settings.MEDIA_ROOT, "splitted_file")
    # os.makedirs(splitted_file_path)
    fs = FileSplit(file_to_split, split_size, splitted_file_path)
    try:
        fs.split()
        os.remove(file_to_split)
    except Exception as e:
        print(e)
        return False
    else:
        return splitted_file_path
def main():
    dev_limits = config_get_dev_limits()
    token = api_get_token()
    pair_index = 0
    for dev_upload_path in config_get_dev_paths()["upload_pairs"]:
        file_id = api_get_file_id(token, None, pair_index)
        upload_url = api_create_upload_session(token, pair_index)
        total_size = fs_get_upload_size(pair_index)
        if total_size > dev_limits["upload_partition_limit"]:
            limit = dev_limits["upload_partition_limit"]
            print_message(
                "Greater than " + str(limit / 1024 / 1024) +
                " MB - need to split into chunks", "UPLOAD", "verbose")
            with tempfile.TemporaryDirectory(dir=fs_get_parent_dir()) as tmpdirname:
                print_message("Created temporary directory: " + tmpdirname,
                              "UPLOAD", "verbose")
                fs = FileSplit(file=dev_upload_path, splitsize=limit,
                               output_dir=tmpdirname)
                fs.split()
                chunks = get_chunks(tmpdirname, pair_index)
                start_byte = 0
                for chunk_name in chunks:
                    chunk_path = fs_get_chunk_full_path(tmpdirname, chunk_name)
                    chunk_size = fs_get_chunk_size(chunk_path)
                    if chunk_size > limit:
                        raise RuntimeError(
                            "There was a problem partitioning the tar file")
                    with open(chunk_path, 'rb') as chunk:
                        payload = chunk.read()
                        api_upload_chunk(upload_url, start_byte,
                                         start_byte + chunk_size - 1,
                                         total_size, payload)
                    start_byte += chunk_size
        else:
            print_message("Uploading entire file in one chunk", "UPLOAD", "verbose")
            with open(dev_upload_path, 'rb') as file:
                payload = file.read()
                api_upload_chunk(upload_url, 0, total_size - 1, total_size, payload)
        maintain_size(token, file_id, pair_index)
        pair_index += 1
def dispatchFile(fileName: str, nodeList: list):
    fs = FileSplit(file=fileName,
                   splitsize=math.ceil(os.path.getsize(fileName) / len(nodeList)) + 10,
                   output_dir='.')

    def func(filePath, size, count):
        print("Dispatching file: {0}, size: {1}, count: {2}".format(
            filePath, size, count))
        num = int(filePath.split('_')[-1].split('.')[0]) - 1
        send_file(nodeList[num], filePath, filePath.split('/')[-1])
        os.remove(filePath)

    fs.split(callback=func)
def split_file(root_path, split_size_mb, output_path):
    # Convert MB to bytes
    split_size_mb = split_size_mb * 1048576
    # Collect files from the root path
    file_path = collect_files(root_path)
    for file in file_path:
        # Create the output directory
        export_path = create_dir(file, output_path)
        # Split the file, keeping the header in each chunk
        fs = FileSplit(file=file, splitsize=split_size_mb, output_dir=export_path)
        fs.split(include_header=True, callback=split_file_stats)
class DMFCSV:
    def __init__(self, database="", table_name="", file_data=[]):
        self.__file_name = "_".join([database, table_name])
        self.__data = file_data

    def write_csv(self):
        with open(".".join([self.__file_name, FILE_EXT]), 'w') as csvfile:
            self.__data_writer = csv.writer(csvfile)
            self.__data_writer.writerows(self.__data)
        return True

    def split_csv(self):
        self.__fs = FileSplit(file=".".join([self.__file_name, FILE_EXT]),
                              splitsize=SPLIT_SIZE)
        return self.__fs.split(include_header=True)
def app_main(config):
    """
    Main loop of the program
    1. Split the input file into chunks
    2. Create a pool of worker processes and distribute the chunks between the processes
    3. Write the combined dataframe to a file
    """
    logger.info("in app_main")

    # initialize some variables that we need
    dns_data = None
    logger.info("configured action is {}".format(config.action))
    run_dir = os.path.join(g.DATA_DIR, g.DATASET_VER, g.RUN_NAME)
    os.makedirs(run_dir, exist_ok=True)

    if config.action == g.ACTION_WRANGLE_ONLY or config.action == g.ACTION_RUN_ALL:
        logger.info("running data wrangling operations...")

        # Step 0: read data needed by all worker threads
        file_path = os.path.join(g.RAW_DATA_DIR, g.DATASET_VER, g.CCTLD_FILE_NAME)
        ccTLD = pd.read_csv(file_path)
        logger.info("read country code top level domains...")
        logger.info(ccTLD.head())

        # step 1: split the file into chunks
        # we expect the file to be within the current project; soon this will be replaced
        # with a download from a dataset repo if not present, so the file is downloaded one time
        # and then available for use as long as the version number does not change (DATASET_VER)
        src_dir = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(src_dir, g.RAW_DATA_DIR, g.DATASET_VER, config.filename)
        output_dir = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER, g.SPLITS_DIR)
        p = Path(file_path)
        if p.exists() is False:
            logger.error("%s does not exist, exiting %s", file_path, g.APP_NAME)
            sys.exit()
        else:
            logger.info("%s found, going to split it into %sMB sized chunks now",
                        file_path, config.splitsizemb)
            os.makedirs(output_dir, exist_ok=True)
            split_size = g.MB * int(config.splitsizemb)
            # the split will create a malformed last line because it is size based and not
            # line based, but that is ok..we have millions of lines so a few do not matter
            fs = FileSplit(file_path, split_size, output_dir)
            fs.split()
            logger.info("file split complete..moving to wrangling stage now")

        file_list = [os.path.join(output_dir, f) for f in os.listdir(output_dir)
                     if os.path.isfile(os.path.join(output_dir, f))]
        logger.info("file names for the splits are {}".format(file_list))

        # step 2: wrangle data
        dns_data, _, fqdn_stats, _, hourly_per_query_counts, _, _, _, _, _, hourly_unique_src_ip_counts = wrangle_data(file_list, ccTLD)
        logger.info("done wrangling data...")

        # step 3: get tfidf values for each domain so we can identify stop domains like stop words
        logger.info("going to do tfidf on domain names to figure out stop domains now..")
        fqdn_tfidf, fqdn_tfidf_subset = do_tfidf_transform_on_query_names()

        # step 3.1: create file with per hour query counts for the most non-informative domains
        # (from our perspective of figuring out similarities between domain names)
        low_tfidf_domains = fqdn_tfidf_subset['fqdn']
        hourly_per_query_counts_low_tfidf = hourly_per_query_counts[hourly_per_query_counts['query'].isin(low_tfidf_domains)]
        file_path = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER,
                                 g.HOURLY_PER_QUERY_COUNT_LOW_TFIDF_TIMESERIES_FILE_NAME)
        hourly_per_query_counts_low_tfidf.to_csv(file_path, index=False)

        # step 3.2: create file with per hour unique src ip counts for the most non-informative
        # domains (from our perspective of figuring out similarities between domain names)
        hourly_unique_src_ip_counts_low_tfidf = hourly_unique_src_ip_counts[hourly_unique_src_ip_counts['query'].isin(low_tfidf_domains)]
        file_path = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER,
                                 g.HOURLY_UNIQUE_SRC_IP_COUNT_PER_LOW_TFIDF_QUERY)
        hourly_unique_src_ip_counts_low_tfidf.to_csv(file_path, index=False)

        # step 3.3: create file with per hour unique src ip counts for the top 100 most
        # frequently accessed domains
        hourly_unique_src_ip_counts_for_most_freq = hourly_unique_src_ip_counts[hourly_unique_src_ip_counts['query'].isin(fqdn_stats.tail(g.N_FOR_FQDNS_W_LOWEST_TFIDF)['query'])]
        file_path = os.path.join(src_dir, g.DATA_DIR, g.DATASET_VER,
                                 g.HOURLY_UNIQUE_SRC_IP_COUNT_MOST_FREQ_FQDNS)
        hourly_unique_src_ip_counts_for_most_freq.to_csv(file_path, index=False)

        # going to do some post processing on dns data
        # 1. remove the low tfidf domains i.e. treat them like stopwords in a sentence
        # 2. shorten the domain names to only keep the most significant part of it
        # 3. write to dnsdata file
        logger.info("going to post process dns data from previous step to remove stop domain names")
        dns_data = post_process_dns_data(dns_data, low_tfidf_domains, ccTLD)
        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.WRANGLED_DATASET_FILE_NAME)
        dns_data.to_csv(file_path, index=False)
        logger.info("wrote the final dns dataset to {}".format(file_path))

    if config.action == g.ACTION_ANALYZE_ONLY or config.action == g.ACTION_RUN_ALL:
        logger.info("running analysis...")
        if dns_data is None:
            # read the dns dataset created as part of the analysis phase, either in this run
            # or the previous one
            file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.WRANGLED_DATASET_FILE_NAME)
            dns_data = pd.read_csv(file_path)

        # we have the data, either from the previous step or from reading the file
        logger.info(dns_data['query'].head())

        # read each line from the dataframe and split it to convert it into an array,
        # because word2vec needs an array of arrays (sentences)..
        # but before we do that we also need to typecast each individual domain name in the
        # "sentence" to a string because some domain names like 169.254 make Python think
        # this is a float..
        # queries_as_sentences = [[q for q in str(queries).split() if q.startswith("www")] for queries in dns_data['query']]
        queries_as_sentences = [str(queries).split() for queries in dns_data['query']]

        # write this list of lists to a file; we would use this as input to an LSTM model
        # to predict what domain name comes after a sequence of domains
        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.FINAL_INPUT_TO_W2V_MODEL_FILE_NAME)
        with open(file_path, "w") as f:
            wr = csv.writer(f)
            wr.writerows(queries_as_sentences)
        # logger.info([len(q) for q in queries_as_sentences])

        # run word2vec
        # log all the word2vec config to a file for record purposes
        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.RUN_NAME, g.W2V_PARAMS_FILE_NAME)
        with open(file_path, "w") as w2v_params_file:
            w2v_parms = {
                'run_name': g.RUN_NAME,
                'num_sentences': len(queries_as_sentences),
                'embedding_size': g.W2V_EMBEDDING_SIZE,
                'window_size': g.W2V_WINDOW,
                'min_count': g.W2V_MIN_COUNT,
                'negative': g.W2V_NEGATIVE_SAMPLING,
                'max_vocab_size': g.W2V_MAX_VOCAB_SIZE,
                'sample': g.W2V_SAMPLE,
                'ns_exponent': g.W2V_NS_EXPONENT,
                'num_workers': g.W2V_WORKERS,
                'sg': g.W2V_USE_SG,
                'epochs': g.WV_EPOCHS,
                'seed': g.WV_SEED
            }
            w2v_params_file.write(json.dumps(w2v_parms, indent=2))

        model_dns = Word2Vec(sentences=queries_as_sentences,
                             size=g.W2V_EMBEDDING_SIZE,
                             window=g.W2V_WINDOW,
                             min_count=g.W2V_MIN_COUNT,
                             workers=g.W2V_WORKERS,
                             sg=g.W2V_USE_SG,
                             iter=g.WV_EPOCHS,
                             negative=g.W2V_NEGATIVE_SAMPLING,
                             max_vocab_size=g.W2V_MAX_VOCAB_SIZE,
                             sample=g.W2V_SAMPLE,
                             ns_exponent=g.W2V_NS_EXPONENT,
                             seed=g.WV_SEED,
                             compute_loss=True,
                             callbacks=[EpochLogger()])

        file_path = os.path.join(g.DATA_DIR, g.DATASET_VER, g.RUN_NAME,
                                 g.DNS_DATASET_FILE_NAME.split('.')[0] + g.MODEL_FILE_SUFFIX)
        logger.info("saving Dns2Vec model to {}".format(file_path))
        model_dns.save(file_path)

        logger.info("going to save dns vectors...")
        save_dns_vectors_for_tensorboard(model_dns)

        logger.info("going to run tests to predict similar domains...")
        run_dns2vec_tests(model_dns)

    logger.info("nothing more to do..")
def run_import_file(
    account,
    user,
    password,
    database,
    warehouse,
    schema,
    schema_table,
    file_path,
    delimiter=",",
):
    row_count = 0
    db = Database(
        account,
        user,
        password,
        database,
        warehouse,
        schema,
    )
    try:
        # Uppercase table
        schema_table = schema_table.upper()

        # File dir and name
        path_parts = os.path.split(file_path)
        file_dir = path_parts[0]
        file_name = path_parts[1]

        # Table name underscored
        table_name_text = file_name.replace('.', '_')

        # File format
        try:
            table_format = table_name_text + "_format"
            create_format_sql = (
                "create or replace file format " + table_format +
                " type = 'CSV' field_delimiter = '" + delimiter
                # + "' skip_header = 1; "
                + "'; ")
            logger.debug(create_format_sql)
            db.execute(create_format_sql)
        except Exception:
            logger.error('Failed create fileformat: ' + file_path)

        # Stage
        try:
            table_stage = table_name_text + "_stage"
            create_stage_sql = ("create or replace stage " + table_stage +
                                " file_format = " + table_format + ";")
            logger.debug(create_stage_sql)
            db.execute(create_stage_sql)
        except Exception:
            logger.error('Failed create stage: ' + file_path)

        # Split files
        fs = FileSplit(file=file_path, splitsize=50000000, output_dir=file_dir)
        fs.split()
        os.remove(file_path)

        # PUT
        try:
            files = file_path[:-4] + "*.csv"
            put_sql = ("PUT file://" + files + " @" + table_stage +
                       " auto_compress=true;")
            logger.debug(put_sql)
            db.execute(put_sql)
        except Exception:
            logger.error('Failed PUT: ' + file_path)

        # COPY
        try:
            copy_sql = ("COPY INTO " + schema_table + " FROM @" + table_stage +
                        " file_format = (format_name = " + table_format +
                        ") on_error = 'CONTINUE';")
            logger.debug(copy_sql)
            sfqid = db.execute(copy_sql).sfqid
            logger.debug("copy table success")
            logger.debug('Snowflake copy query id: ' + sfqid)
            sfqid = "'" + sfqid + "'"
        except Exception:
            logger.error('Failed COPY: ' + file_path)

        # Load results
        try:
            qstring = 'SELECT * FROM TABLE(RESULT_SCAN({}))'
            load_result = db.execute(qstring.format(sfqid)).fetchall()
            for res in load_result:
                logger.debug(res)
            row_count = sum([row[3] for row in load_result])
            if len([row[1] for row in load_result if row[1] == 'LOAD_FAILED']) > 0:
                logger.error('Load completed with errors')
        except Exception:
            logger.error('Failed getting load results: ' + file_path)

        # Remove stage
        drop_stage_sql = "DROP STAGE IF EXISTS " + table_stage
        logger.debug(drop_stage_sql)
        db.execute(drop_stage_sql)
        logger.debug("stage deleted")

        # Remove file format
        drop_file_format_sql = "DROP FILE FORMAT IF EXISTS " + table_format
        logger.debug(drop_file_format_sql)
        db.execute(drop_file_format_sql)
        logger.debug("format deleted")

        logger.debug(str(row_count) + " records imported")
        return row_count
    except Exception:
        logger.error("Failed importing file: " + file_path)
        return row_count
from fsplit.filesplit import FileSplit

fs = FileSplit(file='./wiki.json', splitsize=10240000, output_dir='../data')
fs.split()
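# Optional follow-up sketch (not part of the original snippet): assuming the splitter
# names its parts wiki_1.json, wiki_2.json, ... as in the other examples here, this
# lists the generated chunks and compares their combined size with the source file.
import os

parts = sorted(f for f in os.listdir('../data')
               if f.startswith('wiki_') and f.endswith('.json'))
total = sum(os.path.getsize(os.path.join('../data', p)) for p in parts)
print(len(parts), "chunks,", total, "bytes vs",
      os.path.getsize('./wiki.json'), "bytes in the source")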
def main(): print("Wikipedia Wordlist Extractor") xmlfile_path = args.n pros = int(args.p) listname = args.f[:-4] + "-wordlist.txt" ################################################# # Creating directory for storing splits if such a # directory does not already exists ################################################# path = os.path.join(os.getcwd(), "wikisplits") try: os.mkdir(path) except OSError: print("Creation of the directory %s failed" % path) else: print("Wikisplits directory created") ################################################################# # Splits up the file to facilitate data manipulation (size in MB) ################################################################# print("Splitting file", listname, "...") fs = FileSplit(file=xmlfile_path, splitsize=5000000, output_dir=path) fs.split() print("File", xmlfile_path, "was split in directory", path) ################################################### # Proceeds to strip the files of characters, delete # smaller words, and clean up empty lines ################################################### # Removes all non-Alphanumeric Characters print("\nRemoving non-alphanumeric characters & words smaller than 4...") os.chdir(path) directory = os.getcwd() dirlist = sorted(os.listdir(directory)) pool = Pool(processes=pros) for _ in tqdm(pool.imap(removal, dirlist), total=len(dirlist)): pass print( "\nNon-Alphanumeric characters and words of size smaller than 4 removed." ) ######################################################## # Merges the lists back together, eliminating duplicates # and then deletes the 'wikisplit' directory. ######################################################## print("\nMerging lists back") worddict = {} for file in sorted(os.listdir(directory)): with open(file, 'r+', encoding="UTF-8") as f: for line in tqdm(f): if (len(line.strip()) == 0): continue worddict[line.strip()] = 1 fileresult = open(listname, 'w', encoding="UTF-8") for word in tqdm(sorted(list(worddict.keys()))): fileresult.write(word + '\n') fileresult.close() os.chdir('..') shutil.move(os.path.join(path, listname), os.path.join(os.getcwd(), "wikilists")) print( listname + " has been created. It contains all the words from the wikipedia dump minus duplicates, and has also been sorted in ascending order." ) try: shutil.rmtree(path) except Exception as e: print('%s ' % e) else: print('\n%s was deleted' % path)
# params['database']['password'] = pw
# pop_util = PopulationUtil(params['database'])
handler = MagController(params['mag_indexes'])
with open(params['mapping_path'], 'r') as handle:
    apn_maz_mapping = json.load(handle)

sliced = False
if sliced:
    st = os.stat(params['mag_path'])
    split_size = 300000000
    mag_dir = '/'.join(params['mag_path'].split('/')[0:-1])
    fs = FileSplit(params['mag_path'], split_size, output_dir=mag_dir)
    fs.split(include_header=True)
    for i in range(1, (st.st_size % split_size) + 1):
        params['mag_path'] = f"../../data/raw_travel_data/output_disaggTripList_{i}.csv"
        plans = handler.plans_by_col(params['mag_path'], params['mag_column_ids'])
        population = handler.plans_to_population(plans)
        # with open(f'../../data/processed_MAG_data/population_{i}.dill', 'wb+') as handle:
        #     dill.dump(population, handle)

# with open('../../data/processed_MAG_data/population.dill', 'rb') as handle:
#     population: MagPopulation = dill.load(handle)
def splitFile(fileName):
    fs = FileSplit(file=CONST_DOWN + fileName,
                   splitsize=CONST_BLOCK_SIZE * CONST_BYTES_PER_MB,
                   output_dir=CONST_DOWN)
    fs.split()
    os.remove(CONST_DOWN + fileName)
def split_csv(file_name='', file_ext='csv'):
    fs = FileSplit(file=".".join([file_name, file_ext]), splitsize=10)
    # fs = FileSplit(file=".".join([file_name, file_ext]), splitsize=1000000000)
    return fs.split(include_header=True)
# pip install filesplit

# In[39]:

from fsplit.filesplit import FileSplit

f = FileSplit(file='C:/Users/Niteesha/Desktop/mendley/dataset.csv', splitsize=1848000)

# In[40]:

f.split()

# In[41]:

import hashlib

# In[42]:

file = "dataset_1.csv"  # Location of the file (can be set a different way)
BLOCK_SIZE = 65536  # The size of each read from the file

file_hash = hashlib.sha256()  # Create the hash object, can use something other than `.sha256()` if you wish
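# Hedged sketch (not from the original notebook): the cell above creates the hash
# object but the chunked-read loop is missing. The standard hashlib idiom, using the
# `file`, `BLOCK_SIZE`, and `file_hash` names defined above, would look like this.
with open(file, 'rb') as fh:
    block = fh.read(BLOCK_SIZE)      # read the first chunk
    while len(block) > 0:            # keep reading until EOF
        file_hash.update(block)      # feed each chunk into the running SHA-256
        block = fh.read(BLOCK_SIZE)
print(file_hash.hexdigest())         # hex digest of dataset_1.csv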
def split_file():
    fs = FileSplit(file='C:\\Users\\Lakshman\\Downloads\\site_hits.tsv',
                   splitsize=1684670,
                   output_dir='C:\\Users\\Lakshman\\Downloads\\splitfiletest')
    fs.split()
    # Split again, this time repeating the header row in every chunk
    fs.split(include_header=True)