def test_fix_asa(self, mock_call):
    self._touch_files()
    # For the ASA the protocols get fixed
    flow_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                 '58,1,1459535021,1459535021\n')
    fixable_line = ('192.168.207.199,198.22.253.72,61391,80,0,'
                    '59,2,1459535022,1459535022\n')
    fixed_line = ('192.168.207.199,198.22.253.72,61391,80,6,'
                  '59,2,1459535022,1459535022\n')
    unfixable_line = ('192.168.207.200,198.22.253.72,61391,80,0,'
                      '59,2,1459535022,1459535022\n')

    def side_effect(*args, **kwargs):
        if 'rwuniq' not in args[0][0]:
            return 0
        with io.open(args[0][14], 'wt') as f:
            f.write(flow_line)
            f.write(fixable_line)
            f.write(unfixable_line)
        return 0

    mock_call.side_effect = side_effect

    env_override = {'OBSRVBL_IPFIX_PROBE_1_SOURCE': 'asa'}
    with patch.dict('ona_service.ipfix_pusher.environ', env_override):
        inst = self._get_instance(IPFIXPusher)
        input_paths = [join(self.input_dir, x) for x in self.ready[0:1]]
        inst._process_files(input_paths)

    with gz_open(input_paths[0], 'rt') as infile:
        lines = infile.readlines()

    self.assertEqual(lines[0], CSV_HEADER + '\n')
    self.assertEqual(lines[1], flow_line)
    self.assertEqual(lines[2], fixed_line)
    self.assertEqual(lines[3], unfixable_line)
def test_fix_sonicwall(self, mock_call):
    self._touch_files()
    # For SonicWALL the timestamps get replaced
    flow_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                 '58,1,1459535021,1459535021\n')
    altered_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                    '58,1,1395669540,1395669540\n')

    def side_effect(*args, **kwargs):
        if 'rwuniq' not in args[0][0]:
            return 0
        with io.open(args[0][14], 'wt') as f:
            f.write(flow_line)
        return 0

    mock_call.side_effect = side_effect

    env_override = {'OBSRVBL_IPFIX_PROBE_1_SOURCE': 'sonicwall'}
    with patch.dict('ona_service.ipfix_pusher.environ', env_override):
        inst = self._get_instance(IPFIXPusher)
        input_paths = [join(self.input_dir, x) for x in self.ready[1:2]]
        inst._process_files(input_paths)

    with gz_open(input_paths[0], 'rt') as infile:
        lines = infile.readlines()

    self.assertEqual(lines[0], CSV_HEADER + '\n')
    self.assertEqual(lines[1], altered_line)
def extract_gz(file: str, path: str) -> str:
    # Strip the trailing ".gz" and write the decompressed copy under `path`
    outfile = os_path.join(path, os_path.basename(file[:-3]))
    makedirs(path, exist_ok=True)
    with gz_open(file, 'rb') as f_in:
        with open(outfile, 'wb') as f_out:
            copyfileobj(f_in, f_out)
    return outfile
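
# Minimal usage sketch for extract_gz above. The file and directory names are
# hypothetical, and the sketch assumes the same gz_open import the function
# relies on; a tiny archive is written first so the call is runnable.
def _demo_extract_gz():
    sample = '/tmp/demo_extract.txt.gz'
    with gz_open(sample, 'wt') as f:
        f.write('hello\n')
    # Decompresses to /tmp/demo_out/demo_extract.txt and returns that path
    print(extract_gz(sample, '/tmp/demo_out'))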
def download_file(self, name):
    """Download file from GitHub archive.

    :param name: name of GitHub archive in format YYYY-MM-DD-h
    :return: name of JSON file with data if downloading was successful,
        else None
    """
    # TODO: handle exceptions
    archive_name = name + ".json.gz"
    file_name = join(self.new_data_dir, name + ".json")
    try:
        urlretrieve("http://data.githubarchive.org/" + archive_name,
                    filename=join(self.downloaded_data_dir, archive_name))
    except IOError:
        self.logger.error(__name__ + ": " +
                          "unable to download file (error creating connection).")
        return None
    try:
        # Open the downloaded archive as text so it can be written to the
        # JSON file directly
        archive = gz_open(join(self.downloaded_data_dir, archive_name), "rt")
    except IOError:
        self.logger.error(__name__ + ": " +
                          "unable to open gzipped file (file not created).")
        return None
    with open(file_name, "w") as json_file:
        json_file.write(archive.read())
    archive.close()
    remove(join(self.downloaded_data_dir, archive_name))
    return file_name
def _read_file(self, key):
    resp = self.boto_client.get_object(Bucket=self.bucket, Key=key)
    with gz_open(resp['Body'], mode='rt') as gz_f:
        reader = DictReader(gz_f, delimiter=' ')
        reader.fieldnames = [
            f.replace('-', '_') for f in reader.fieldnames
        ]
        yield from reader
def download(url, file_path):
    # Stream the gzipped file to disk, then unpack it to a .mrc file and
    # remove the downloaded archive
    r = get(url, stream=True)
    with open(file_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=1024):
            fd.write(chunk)
    mrc_path = file_path.replace('.gz', '.mrc')
    with gz_open(file_path, 'rb') as f_in:
        with open(mrc_path, 'wb') as f_out:
            copyfileobj(f_in, f_out)
    remove(file_path)
def _read_file(self, key):
    resp = self.boto_client.get_object(Bucket=self.bucket, Key=key)
    with gz_open(resp['Body'], mode='rt') as gz_f:
        reader = DictReader(gz_f, delimiter=' ')
        reader.fieldnames = [
            f.replace('-', '_') for f in reader.fieldnames
        ]
        yield from reader
        with THREAD_LOCK:
            self.bytes_processed += gz_f.tell()
            self.compressed_bytes_processed += resp['ContentLength']
def load_data_frame(file_path: str) -> DataFrame:
    logger.info("Reading file `%s`", file_path)
    if file_path.endswith(".csv.gz"):
        with gz_open(file_path) as file:
            return read_csv(file)
    elif file_path.endswith(".csv"):
        return read_csv(file_path)
    elif file_path.endswith(".gitkeep"):
        return DataFrame()
    raise RuntimeError(f"Unrecognizable file type: {file_path}")
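
# Sketch exercising the extension dispatch in load_data_frame, assuming
# read_csv/DataFrame come from pandas as the function implies. The temp file
# name is hypothetical; a tiny gzipped CSV is written first so the call runs.
def _demo_load_data_frame():
    path = "/tmp/demo_frame.csv.gz"
    with gz_open(path, "wt") as f:
        f.write("a,b\n1,2\n")
    print(load_data_frame(path))           # one-row, two-column frame
    # load_data_frame("/tmp/unknown.bin")  # would raise RuntimeError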
def _process_files(self, file_list):
    for file_path in file_list:
        file_dir, file_name = split(file_path)
        temp_path = join(file_dir, '{}.tmp'.format(file_name))
        copy(file_path, temp_path)
        try:
            all_rows = self._get_nvz_flows(temp_path)
            with gz_open(file_path, 'wt') as outfile:
                writer = DictWriter(outfile, CSV_HEADER)
                writer.writeheader()
                writer.writerows(all_rows)
        finally:
            remove(temp_path)
def rotate(self):
    log_file = self.log_file
    # Do we save to a file?
    if log_file is None:
        return False

    # Save the current log
    # XXX In a multithreaded context we would need a lock here
    new_name = log_file + '.' + strftime('%Y-%m-%d_%H%M')
    # We don't delete an existing file
    if exists(new_name + '.gz'):
        # If here, interval < 1min
        return self.rotate_interval

    # Is there already a log file?
    if exists(log_file):
        # Yes, move it
        move(log_file, new_name)
    else:
        # No, create an empty one
        open(new_name, 'w').close()

    # Compress it
    f_in = open(new_name, 'rb')
    f_out = gz_open(new_name + '.gz', 'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
    remove(new_name)

    # Delete the old files
    files = []
    n2 = '[0-9][0-9]'
    date_pattern = n2 + n2 + '-' + n2 + '-' + n2 + '_' + n2 + n2
    for name in glob(log_file + '.' + date_pattern + '.gz'):
        try:
            date = datetime.strptime(name[-18:-3], '%Y-%m-%d_%H%M')
        except ValueError:
            continue
        files.append((date, name))
    files.sort(reverse=True)
    for a_file in files[LOG_FILES_NUMBER:]:
        remove(a_file[1])

    # Always return the rotate interval to stay "cron" compliant
    return self.rotate_interval
def _check_point_to_csv(self, send_segment, now):
    # Writes files to the "input" directory so the pusher will find them,
    # archive them, and send them out.

    # The input directory may not have been created yet
    create_dirs(self.input_dir)

    segment_data = self.log_node.parsed_data.pop(send_segment, [])
    if not segment_data:
        return

    file_name = '{}_{}.csv.gz'.format(send_segment.strftime(self.file_fmt),
                                      now.strftime(self.file_fmt))
    file_path = join(self.input_dir, file_name)
    with gz_open(file_path, 'wt') as outfile:
        writer = DictWriter(outfile, CSV_HEADER)
        writer.writeheader()
        writer.writerows(self._format_item(x) for x in segment_data)
def test_fix_meraki(self, mock_call):
    self._touch_files()
    # Intermediate report - will get subsumed by the next line
    flow_line_1 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                   '58,1,1459535021,1459535021\n')
    # Final report before reset - this will show up in the output
    flow_line_2 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                   '158,1,1459535021,1459535021\n')
    # After the reset - this won't show up
    flow_line_3 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                   '157,1,1459535021,1459535021\n')
    # This one shows up because it's the last report
    flow_line_4 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                   '159,1,1459535021,1459535021\n')

    altered_line_1 = ('192.168.207.199,198.22.253.72,61391,80,6,'
                      '158,1,1395669540,1395669540\n')
    altered_line_2 = ('192.168.207.199,198.22.253.72,61391,80,6,'
                      '159,1,1395669540,1395669540\n')

    def side_effect(*args, **kwargs):
        if 'rwcut' not in args[0][0]:
            return 0
        with io.open(args[0][11], 'wt') as f:
            f.write(flow_line_1)
            f.write(flow_line_2)
            f.write(flow_line_3)
            f.write(flow_line_4)
        return 0

    mock_call.side_effect = side_effect

    env_override = {'OBSRVBL_IPFIX_PROBE_1_SOURCE': 'meraki'}
    with patch.dict('ona_service.ipfix_pusher.environ', env_override):
        inst = self._get_instance(IPFIXPusher)
        input_paths = [join(self.input_dir, x) for x in self.ready[1:2]]
        inst._process_files(input_paths)

    with gz_open(input_paths[0], 'rt') as infile:
        lines = infile.readlines()

    self.assertEqual(len(lines), 1 + 2)  # Header + Rows
    self.assertEqual(lines[0], CSV_HEADER + '\n')
    self.assertEqual(lines[1], altered_line_1)
    self.assertEqual(lines[2], altered_line_2)
def _read_file(self, key):
    resp = self.boto_client.get_object(Bucket=self.bucket, Key=key)
    if key.endswith('.parquet'):
        body = resp['Body'].read()
        reader = parquet_dict_reader(io.BytesIO(body))
        yield from reader
        with THREAD_LOCK:
            self.bytes_processed += len(body)
            self.compressed_bytes_processed += resp['ContentLength']
    else:
        with gz_open(resp['Body'], mode='rt') as gz_f:
            reader = csv_dict_reader(gz_f, delimiter=' ')
            reader.fieldnames = [
                f.replace('-', '_') for f in reader.fieldnames
            ]
            yield from reader
            with THREAD_LOCK:
                self.bytes_processed += gz_f.tell()
                self.compressed_bytes_processed += resp['ContentLength']
def compress_gz(inFile: str, outFile: str = '', delete: bool = False) -> str:
    """
    Compress 'inFile' into gzip format.

    Parameters:
        inFile  -- file to compress
        outFile -- (Optional) path of the compressed file; if omitted, the
                   archive is written alongside the original as 'inFile.gz'
        delete  -- (Optional) delete the original file after compression
                   (default: False)

    Returns:
        The archive filename
    """
    if not outFile:
        outFile = inFile + '.gz'
    with open(inFile, 'rb') as f_in, gz_open(outFile, 'wb') as f_out:
        f_out.writelines(f_in)
    if delete:
        remove(inFile)
    return outFile
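
# Usage sketch for compress_gz; the file names are hypothetical. With no
# outFile the archive lands next to the source as '<inFile>.gz', and
# delete=True removes the uncompressed original afterwards.
def _demo_compress_gz():
    src = '/tmp/demo_compress.txt'
    with open(src, 'w') as f:
        f.write('some text\n')
    archive = compress_gz(src, delete=True)
    print(archive)  # -> /tmp/demo_compress.txt.gz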
def main():
    sequence_read_fp = args['input']
    fw = open(args['output'], 'w')
    # Open gzipped input as text so the parsers receive str lines
    if sequence_read_fp.endswith('.gz'):
        sequence_read_f = gz_open(sequence_read_fp, 'rt')
    else:
        sequence_read_f = open(sequence_read_fp, 'r')
    if args['file_type'] == 'fastq':
        for seq_name, seq_base, seq_qual in read_fastq(sequence_read_f):
            fw.write("%s\n%s\n%s\n" % (seq_name, seq_base, seq_qual))
    if args['file_type'] == 'fasta':
        for seq_name, seq_base in read_fasta(sequence_read_f):
            fw.write(">%s\n%s\n" % (seq_name, seq_base))
    sequence_read_f.close()
    fw.close()
def test_csv_header(self, mock_call):
    self._touch_files()
    flow_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                 '58,1,1459535021,1459535021\n')

    def side_effect(*args, **kwargs):
        if 'rwuniq' not in args[0][0]:
            return 0
        with io.open(args[0][14], 'wt') as f:
            f.write(flow_line)
        return 0

    mock_call.side_effect = side_effect

    inst = self._get_instance(IPFIXPusher)
    input_paths = [join(self.input_dir, x) for x in self.ready[0:1]]
    inst._process_files(input_paths)

    with gz_open(input_paths[0], 'rt') as infile:
        lines = infile.readlines()

    self.assertEqual(lines[0], CSV_HEADER + '\n')
    self.assertEqual(lines[1], flow_line)
def test_process_files(self):
    # Write some test data
    file_path = join(self.input_dir, 'nvzflow.log')
    with io.open(file_path, 'wt') as f:
        for line in LOG_DATA:
            print(json.dumps(line), file=f)

    # Process it
    inst = self._get_instance(NVZFlowPusher)
    inst._process_files([file_path])

    # It should have turned from JSON-object-per-line to CSV with header
    with gz_open(file_path, 'rt') as f:
        actual = f.read()
    expected = ('srcaddr,dstaddr,srcport,dstport,protocol,'
                'bytes_in,bytes_out,start,end\r\n'
                '2001:db8::8e18,2001:db8::100a,58572,53,17,'
                '0,0,1522955857,1522955857\r\n'
                '192.0.2.29,198.51.100.50,56209,443,6,'
                '5113,639,1522955826,1522955857\r\n'
                '0.0.0.0,198.51.100.50,0,443,6,'
                '0,0,1522955857,1522955857\r\n')
    self.assertEqual(actual, expected)
def load_and_preprocess_data(data_set=mystery,
                             word_embedding_path='word2vec.pkl.gz',
                             las=True, max_batch_size=2048,
                             transition_cache=None, seed=1234):
    """Get train/test data

    See TrainingIterable for description of args

    Returns: a tuple of
        - a Transducer object
        - a word embedding matrix
        - a training data iterable
        - an iterable over dev sentences
        - an iterable over dev dependencies (arcs)
        - an iterable over test sentences
        - an iterable over test dependencies (arcs)
    """
    print('Loading word embeddings...')
    stdout.flush()
    if word_embedding_path.endswith('.gz'):
        with gz_open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    else:
        with open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    # add null embedding (we'll initialize later)
    word_embeddings = np.append(
        word_embeddings,
        np.empty((1, word_embeddings.shape[1]), dtype=np.float32),
        axis=0
    )
    print('There are {} word embeddings.'.format(word_embeddings.shape[0]))

    print('Getting training sentences...')
    stdout.flush()
    training_graphs = data_set.parsed_sents('train.conll')
    metadata_path = Path(data_set.root.path) / 'meta.pkl'
    trn_sent, trn_ex = load_metadata(metadata_path)

    tag_set, deprel_set = set(), set()
    with tqdm(total=trn_sent or None, leave=False, unit='sent') as progbar:
        for graph in training_graphs:
            for node in graph.nodes.values():
                if node['address']:  # not root
                    tag_set.add(node['ctag'])
                    deprel_set.add(node['rel'])
            progbar.update()
    tag_list = sorted(tag_set)
    if las:
        deprel_list = sorted(deprel_set)
        status = 'There are {} tags and {} deprel labels'
        status = status.format(len(tag_list), len(deprel_list))
    else:
        deprel_list = []
        status = 'There are {} tags'
        status = status.format(len(tag_list))
    transducer = Transducer(word_list, tag_list, deprel_list)
    training_data = TrainingIterable(training_graphs, transducer,
                                     seed=seed,
                                     max_batch_size=max_batch_size,
                                     las=las,
                                     transition_cache=transition_cache,
                                     n_ex=trn_ex)
    # use training's rng to initialize null embedding
    word_embeddings[-1] = training_data.rng.uniform(-.01, .01, 50)
    print(status,
          'from {} training sentences.'.format(training_data.graphs_len))
    save_metadata(metadata_path, len(training_graphs), len(training_data))

    print('Getting dev sentences...')
    stdout.flush()
    dev_sentences = data_set.tagged_sents('dev.conll')
    dev_arcs = tuple(
        list(transducer.graph2arc(graph, include_deprel=las))
        for graph in tqdm(data_set.parsed_sents('dev.conll'),
                          leave=False, unit='sent')
    )
    print('There are {} dev sentences.'.format(len(dev_arcs)))

    print('Getting test sentences...')
    stdout.flush()
    test_sentences = data_set.tagged_sents('test.conll')
    test_arcs = tuple(
        list(transducer.graph2arc(graph, include_deprel=las))
        for graph in tqdm(data_set.parsed_sents('test.conll'),
                          leave=False, unit='sent')
    )
    print('There are {} test sentences.'.format(len(test_arcs)))
    return (transducer, word_embeddings, training_data, dev_sentences,
            dev_arcs, test_sentences, test_arcs)
def load_and_preprocess_data(data_set=ud_english,
                             word_embedding_path='word2vec.pkl.gz',
                             las=True, max_batch_size=2048,
                             transition_cache=None, seed=1234):
    '''Get train/test data

    See TrainingIterable for description of args

    Returns: a tuple of
        - a Transducer object
        - a word embedding matrix
        - a training data iterable
        - an iterable over dev sentences
        - an iterable over dev dependencies (arcs)
        - an iterable over test sentences
        - an iterable over test dependencies (arcs)
    '''
    print('Loading word embeddings...', end='')
    stdout.flush()
    if word_embedding_path.endswith('.gz'):
        with gz_open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    else:
        with open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    # add null embedding (we'll initialize later)
    word_embeddings = np.append(
        word_embeddings,
        np.empty((1, word_embeddings.shape[1]), dtype=np.float32),
        axis=0)
    print('there are {} word embeddings.'.format(word_embeddings.shape[0]))

    print('Determining POS tags...', end='')
    stdout.flush()
    tag_set = set()
    for sentence in data_set.tagged_sents('train.conll'):
        for _, tag in sentence:
            tag_set.add(tag)
    tag_list = sorted(tag_set)
    print('there are {} tags.'.format(len(tag_list)))

    training_graphs = data_set.parsed_sents('train.conll')
    if las:
        print('Determining deprel labels...', end='')
        stdout.flush()
        deprel_set = set()
        for graph in training_graphs:
            for node in graph.nodes.values():
                if node['address']:  # not root
                    deprel_set.add(node['rel'])
        deprel_list = sorted(deprel_set)
        print('there are {} deprel labels.'.format(len(deprel_list)))
    else:
        deprel_list = []

    transducer = Transducer(word_list, tag_list, deprel_list)

    print('Getting training data...', end='')
    stdout.flush()
    training_data = TrainingIterable(
        training_graphs, transducer,
        max_batch_size=max_batch_size,
        las=las,
        transition_cache=transition_cache,
        seed=seed,
    )
    # use training's rng to initialize null embedding
    word_embeddings[-1] = training_data.rng.uniform(-.01, .01, 50)
    print('there are {} samples.'.format(len(training_data)))

    print('Getting dev data...', end='')
    stdout.flush()
    dev_sentences = data_set.tagged_sents('dev.conll')
    dev_arcs = tuple(
        list(transducer.graph2arc(graph, include_deprel=las))
        for graph in data_set.parsed_sents('dev.conll'))
    print('there are {} samples.'.format(len(dev_arcs)))

    print('Getting test data...', end='')
    stdout.flush()
    test_sentences = data_set.tagged_sents('test.conll')
    test_arcs = tuple(
        list(transducer.graph2arc(graph, include_deprel=las))
        for graph in data_set.parsed_sents('test.conll'))
    print('there are {} samples.'.format(len(test_arcs)))

    return (transducer, word_embeddings, training_data, dev_sentences,
            dev_arcs, test_sentences, test_arcs)
def extract_gz_to_string(filename: str) -> str:
    # Read the whole archive and decode it as text (UTF-8 by default);
    # the context manager ensures the file handle is closed
    with gz_open(filename, mode='rb') as gz:
        return gz.read().decode()
def deserialize(filename):
    with gz_open(filename, 'rb') as f:
        return load(f)
def serialize(obj, filename):
    with gz_open(filename, 'wb') as f:
        dump(obj, f, HIGHEST_PROTOCOL)
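
# Round-trip sketch for serialize/deserialize above, assuming dump, load and
# HIGHEST_PROTOCOL come from pickle as the names suggest. The file path is
# hypothetical.
def _demo_serialize_roundtrip():
    payload = {'counts': [1, 2, 3], 'label': 'demo'}
    serialize(payload, '/tmp/demo_payload.pkl.gz')
    restored = deserialize('/tmp/demo_payload.pkl.gz')
    assert restored == payload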