Example No. 1
    def test_fix_asa(self, mock_call):
        self._touch_files()
        # For the ASA the protocols get fixed
        flow_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                     '58,1,1459535021,1459535021\n')
        fixable_line = ('192.168.207.199,198.22.253.72,61391,80,0,'
                        '59,2,1459535022,1459535022\n')
        fixed_line = ('192.168.207.199,198.22.253.72,61391,80,6,'
                      '59,2,1459535022,1459535022\n')
        unfixable_line = ('192.168.207.200,198.22.253.72,61391,80,0,'
                          '59,2,1459535022,1459535022\n')

        def side_effect(*args, **kwargs):
            if 'rwuniq' not in args[0][0]:
                return 0
            with io.open(args[0][14], 'wt') as f:
                f.write(flow_line)
                f.write(fixable_line)
                f.write(unfixable_line)
            return 0

        mock_call.side_effect = side_effect

        env_override = {'OBSRVBL_IPFIX_PROBE_1_SOURCE': 'asa'}
        with patch.dict('ona_service.ipfix_pusher.environ', env_override):
            inst = self._get_instance(IPFIXPusher)
            input_paths = [join(self.input_dir, x) for x in self.ready[0:1]]
            inst._process_files(input_paths)

        with gz_open(input_paths[0], 'rt') as infile:
            lines = infile.readlines()
            self.assertEqual(lines[0], CSV_HEADER + '\n')
            self.assertEqual(lines[1], flow_line)
            self.assertEqual(lines[2], fixed_line)
            self.assertEqual(lines[3], unfixable_line)
Example No. 2
    def test_fix_sonicwall(self, mock_call):
        self._touch_files()
        # For SonicWALL the timestamps get replaced
        flow_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                     '58,1,1459535021,1459535021\n')
        altered_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                        '58,1,1395669540,1395669540\n')

        def side_effect(*args, **kwargs):
            if 'rwuniq' not in args[0][0]:
                return 0
            with io.open(args[0][14], 'wt') as f:
                f.write(flow_line)
            return 0

        mock_call.side_effect = side_effect

        env_override = {'OBSRVBL_IPFIX_PROBE_1_SOURCE': 'sonicwall'}
        with patch.dict('ona_service.ipfix_pusher.environ', env_override):
            inst = self._get_instance(IPFIXPusher)
            input_paths = [join(self.input_dir, x) for x in self.ready[1:2]]
            inst._process_files(input_paths)

        with gz_open(input_paths[0], 'rt') as infile:
            lines = infile.readlines()
            self.assertEqual(lines[0], CSV_HEADER + '\n')
            self.assertEqual(lines[1], altered_line)
Example No. 3
def extract_gz(file: str, path: str) -> str:
    outfile = os_path.join(path, os_path.basename(file[:-3]))
    makedirs(path, exist_ok=True)
    with gz_open(file, 'rb') as f_in:
        with open(outfile, 'wb') as f_out:
            copyfileobj(f_in, f_out)
    return outfile
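
A minimal usage sketch for the helper above. The file name and output directory are hypothetical, and the os_path, makedirs, and copyfileobj names that extract_gz relies on are assumed to be imported as in its source module:

from gzip import open as gz_open

# Build a tiny test archive, then extract it into ./extracted
with gz_open('events.json.gz', 'wt') as f:
    f.write('{"id": 1}\n')

extracted = extract_gz('events.json.gz', 'extracted')
print(extracted)  # -> extracted/events.json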
Example No. 4
    def download_file(self, name):
        """Download a file from the GitHub Archive.

        :param name: name of the GitHub Archive file, in the format YYYY-MM-DD-h
        :return: name of the JSON file with the data if the download succeeded, else None
        """
        # TODO: handle exceptions
        archive_name = name + ".json.gz"
        archive_path = join(self.downloaded_data_dir, archive_name)
        file_name = join(self.new_data_dir, name + ".json")

        try:
            urlretrieve("http://data.githubarchive.org/" + archive_name,
                        filename=archive_path)
        except IOError:
            self.logger.error(__name__ + ": unable to download file (error creating connection).")
            return None

        try:
            archive = gz_open(archive_path)
        except IOError:
            self.logger.error(__name__ + ": unable to open gzipped file (file not created).")
        else:
            # Decompress the archive into the JSON file, then drop the .gz
            with archive, open(file_name, "wb") as json_file:
                json_file.write(archive.read())

            remove(archive_path)

            return file_name
Example No. 5
    def _read_file(self, key):
        # Stream rows from a gzipped, space-delimited log object stored in S3
        resp = self.boto_client.get_object(Bucket=self.bucket, Key=key)
        with gz_open(resp['Body'], mode='rt') as gz_f:
            reader = DictReader(gz_f, delimiter=' ')
            reader.fieldnames = [
                f.replace('-', '_') for f in reader.fieldnames
            ]
            yield from reader
Example No. 6
def download(url, file_path):
    r = get(url, stream=True)
    with open(file_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=1024):
            fd.write(chunk)
    mrc_path = file_path.replace('.gz', '.mrc')
    with gz_open(file_path, 'rb') as f_in:
        with open(mrc_path, 'wb') as f_out:
            copyfileobj(f_in, f_out)
    remove(file_path)
Example No. 7
    def _read_file(self, key):
        # Stream rows from a gzipped, space-delimited log object stored in S3,
        # tracking how many uncompressed and compressed bytes were read
        resp = self.boto_client.get_object(Bucket=self.bucket, Key=key)
        with gz_open(resp['Body'], mode='rt') as gz_f:
            reader = DictReader(gz_f, delimiter=' ')
            reader.fieldnames = [
                f.replace('-', '_') for f in reader.fieldnames
            ]
            yield from reader
            with THREAD_LOCK:
                self.bytes_processed += gz_f.tell()
                self.compressed_bytes_processed += resp['ContentLength']
Example No. 8
def load_data_frame(file_path: str) -> DataFrame:
    logger.info("Reading file `%s`", file_path)

    if file_path.endswith(".csv.gz"):
        with gz_open(file_path) as file:
            return read_csv(file)
    elif file_path.endswith(".csv"):
        return read_csv(file_path)
    elif file_path.endswith(".gitkeep"):
        return DataFrame()

    raise RuntimeError(f"Unrecognizable file type: {file_path}")
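
A standalone sketch of the .csv.gz branch above (hypothetical file name, assuming pandas is installed). Note that read_csv can also infer gzip compression from the .gz suffix by itself:

import gzip

import pandas as pd

# Write a tiny gzipped CSV, then read it back
with gzip.open('events.csv.gz', 'wt') as f:
    f.write('id,value\n1,10\n2,20\n')

df = pd.read_csv('events.csv.gz')  # compression='infer' is the default
print(df.shape)  # -> (2, 2)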
Example No. 9
    def _process_files(self, file_list):
        # Rewrite each input file in place as a gzipped CSV, parsing the
        # flows from a temporary copy of the original
        for file_path in file_list:
            file_dir, file_name = split(file_path)
            temp_path = join(file_dir, '{}.tmp'.format(file_name))
            copy(file_path, temp_path)
            try:
                all_rows = self._get_nvz_flows(temp_path)
                with gz_open(file_path, 'wt') as outfile:
                    writer = DictWriter(outfile, CSV_HEADER)
                    writer.writeheader()
                    writer.writerows(all_rows)
            finally:
                remove(temp_path)
Example No. 10
    def rotate(self):
        log_file = self.log_file

        # Are we saving to a file?
        if log_file is None:
            return False

        # Save the current log
        # XXX In a multi-threaded context, a lock must be added here
        new_name = log_file + '.' + strftime('%Y-%m-%d_%H%M')
        # Don't overwrite an existing archive
        if exists(new_name + '.gz'):
            # If we get here, the rotation interval is under one minute
            return self.rotate_interval
        # Is there already a log file?
        if exists(log_file):
            # Yes, we move it
            move(log_file, new_name)
        else:
            # No, we create an empty one
            open(new_name, 'w').close()

        # Compress it and remove the uncompressed copy
        with open(new_name, 'rb') as f_in, gz_open(new_name + '.gz', 'wb') as f_out:
            f_out.writelines(f_in)
        remove(new_name)

        # Remove the old rotated archives
        files = []
        n2 = '[0-9][0-9]'
        date_pattern = n2 + n2 + '-' + n2 + '-' + n2 + '_' + n2 + n2
        for name in glob(log_file + '.' + date_pattern + '.gz'):
            try:
                date = datetime.strptime(name[-18:-3], '%Y-%m-%d_%H%M')
            except ValueError:
                continue
            files.append((date, name))
        files.sort(reverse=True)
        for a_file in files[LOG_FILES_NUMBER:]:
            remove(a_file[1])

        # Always return a truthy value to stay "cron" compliant
        return self.rotate_interval
Example No. 11
    def _check_point_to_csv(self, send_segment, now):
        # Writes files to the "input" directory so the pusher will find them,
        # archive them, and send them out.

        # The input directory may not have been created yet
        create_dirs(self.input_dir)

        segment_data = self.log_node.parsed_data.pop(send_segment, [])
        if not segment_data:
            return

        file_name = '{}_{}.csv.gz'.format(send_segment.strftime(self.file_fmt),
                                          now.strftime(self.file_fmt))
        file_path = join(self.input_dir, file_name)
        with gz_open(file_path, 'wt') as outfile:
            writer = DictWriter(outfile, CSV_HEADER)
            writer.writeheader()
            writer.writerows(self._format_item(x) for x in segment_data)
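
The gzip-plus-DictWriter pattern used above can be exercised on its own; a minimal standalone sketch with hypothetical column names and rows:

from csv import DictWriter
from gzip import open as gz_open

COLUMNS = ['srcaddr', 'dstaddr', 'bytes']  # hypothetical header
rows = [{'srcaddr': '10.0.0.1', 'dstaddr': '10.0.0.2', 'bytes': 1500}]

with gz_open('segment.csv.gz', 'wt') as outfile:
    writer = DictWriter(outfile, COLUMNS)
    writer.writeheader()
    writer.writerows(rows)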
Example No. 12
    def test_fix_meraki(self, mock_call):
        self._touch_files()
        # Intermediate report - will get subsumed by the next line
        flow_line_1 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                       '58,1,1459535021,1459535021\n')
        # Final report before reset - this will show up in the output
        flow_line_2 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                       '158,1,1459535021,1459535021\n')
        # After the reset - this won't show up
        flow_line_3 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                       '157,1,1459535021,1459535021\n')
        # This one shows up because it's the last report
        flow_line_4 = ('198.22.253.72,192.168.207.199,80,61391,6,'
                       '159,1,1459535021,1459535021\n')

        altered_line_1 = ('192.168.207.199,198.22.253.72,61391,80,6,'
                          '158,1,1395669540,1395669540\n')
        altered_line_2 = ('192.168.207.199,198.22.253.72,61391,80,6,'
                          '159,1,1395669540,1395669540\n')

        def side_effect(*args, **kwargs):
            if 'rwcut' not in args[0][0]:
                return 0
            with io.open(args[0][11], 'wt') as f:
                f.write(flow_line_1)
                f.write(flow_line_2)
                f.write(flow_line_3)
                f.write(flow_line_4)
            return 0

        mock_call.side_effect = side_effect

        env_override = {'OBSRVBL_IPFIX_PROBE_1_SOURCE': 'meraki'}
        with patch.dict('ona_service.ipfix_pusher.environ', env_override):
            inst = self._get_instance(IPFIXPusher)
            input_paths = [join(self.input_dir, x) for x in self.ready[1:2]]
            inst._process_files(input_paths)

        with gz_open(input_paths[0], 'rt') as infile:
            lines = infile.readlines()
            self.assertEqual(len(lines), 1 + 2)  # Header + Rows
            self.assertEqual(lines[0], CSV_HEADER + '\n')
            self.assertEqual(lines[1], altered_line_1)
            self.assertEqual(lines[2], altered_line_2)
Example No. 13
    def _read_file(self, key):
        # Stream rows from either a Parquet object or a gzipped,
        # space-delimited log object stored in S3
        resp = self.boto_client.get_object(Bucket=self.bucket, Key=key)
        if key.endswith('.parquet'):
            body = resp['Body'].read()
            reader = parquet_dict_reader(io.BytesIO(body))
            yield from reader
            with THREAD_LOCK:
                self.bytes_processed += len(body)
                self.compressed_bytes_processed += resp['ContentLength']
        else:
            with gz_open(resp['Body'], mode='rt') as gz_f:
                reader = csv_dict_reader(gz_f, delimiter=' ')
                reader.fieldnames = [
                    f.replace('-', '_') for f in reader.fieldnames
                ]
                yield from reader
                with THREAD_LOCK:
                    self.bytes_processed += gz_f.tell()
                    self.compressed_bytes_processed += resp['ContentLength']
Example No. 14
def compress_gz(inFile: str, outFile: str = '', delete: bool = False) -> str:
    """
    Compress 'inFile' into gzip format.

    Parameters:
    inFile  -- file to compress
    outFile -- (Optional) path of the compressed file; if empty, inFile + '.gz' is used (default: "")
    delete  -- (Optional) delete the original file after compression (default: False)

    Returns:
    The archive filename
    """
    if not outFile:
        outFile = inFile + '.gz'
    with open(inFile, 'rb') as f_in, gz_open(outFile, 'wb') as f_out:
        f_out.writelines(f_in)
    if delete:
        remove(inFile)

    return outFile
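
A minimal usage sketch for compress_gz (hypothetical file name; the gz_open and remove names used above are assumed to be imported in its module):

# Create a small file, compress it, and drop the original
with open('report.txt', 'w') as f:
    f.write('hello gzip\n')

archive = compress_gz('report.txt', delete=True)
print(archive)  # -> report.txt.gz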
Example No. 15
def main():
    sequence_read_fp = args['input']
    fw = open(args['output'], 'w')
    # Open the input as text, transparently handling gzipped files
    if sequence_read_fp.endswith('.gz'):
        sequence_read_f = gz_open(sequence_read_fp, 'rt')
    else:
        sequence_read_f = open(sequence_read_fp, 'r')
    if args['file_type'] == 'fastq':
        for seq_name, seq_base, seq_qual in read_fastq(sequence_read_f):
            fw.write("%s\n%s\n%s\n" % (seq_name, seq_base, seq_qual))
    if args['file_type'] == 'fasta':
        for seq_name, seq_base in read_fasta(sequence_read_f):
            fw.write(">%s\n%s\n" % (seq_name, seq_base))
    sequence_read_f.close()
    fw.close()
Example No. 16
    def test_csv_header(self, mock_call):
        self._touch_files()
        flow_line = ('198.22.253.72,192.168.207.199,80,61391,6,'
                     '58,1,1459535021,1459535021\n')

        def side_effect(*args, **kwargs):
            if 'rwuniq' not in args[0][0]:
                return 0
            with io.open(args[0][14], 'wt') as f:
                f.write(flow_line)
            return 0

        mock_call.side_effect = side_effect

        inst = self._get_instance(IPFIXPusher)

        input_paths = [join(self.input_dir, x) for x in self.ready[0:1]]
        inst._process_files(input_paths)

        with gz_open(input_paths[0], 'rt') as infile:
            lines = infile.readlines()
            self.assertEqual(lines[0], CSV_HEADER + '\n')
            self.assertEqual(lines[1], flow_line)
Example No. 17
    def test_process_files(self):
        # Write some test data
        file_path = join(self.input_dir, 'nvzflow.log')
        with io.open(file_path, 'wt') as f:
            for line in LOG_DATA:
                print(json.dumps(line), file=f)

        # Process it
        inst = self._get_instance(NVZFlowPusher)
        inst._process_files([file_path])

        # It should have turned from JSON-object-per-line to CSV with header
        with gz_open(file_path, 'rt') as f:
            actual = f.read()
        expected = ('srcaddr,dstaddr,srcport,dstport,protocol,'
                    'bytes_in,bytes_out,start,end\r\n'
                    '2001:db8::8e18,2001:db8::100a,58572,53,17,'
                    '0,0,1522955857,1522955857\r\n'
                    '192.0.2.29,198.51.100.50,56209,443,6,'
                    '5113,639,1522955826,1522955857\r\n'
                    '0.0.0.0,198.51.100.50,0,443,6,'
                    '0,0,1522955857,1522955857\r\n')
        self.assertEqual(actual, expected)
Example No. 18
def load_and_preprocess_data(data_set=mystery,
                             word_embedding_path='word2vec.pkl.gz',
                             las=True, max_batch_size=2048,
                             transition_cache=None, seed=1234):
    """Get train/test data

    See TrainingIterable for description of args

    Returns:
         a tuple of
         - a Transducer object
         - a word embedding matrix
         - a training data iterable
         - an iterable over dev sentences
         - an iterable over dev dependencies (arcs)
         - an iterable over test sentences
         - an iterable over test dependencies (arcs)
    """
    print('Loading word embeddings...')
    stdout.flush()
    if word_embedding_path.endswith('.gz'):
        with gz_open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    else:
        with open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    # add null embedding (we'll initialize later)
    word_embeddings = np.append(
        word_embeddings,
        np.empty((1, word_embeddings.shape[1]), dtype=np.float32),
        axis=0
        )
    print('There are {} word embeddings.'.format(word_embeddings.shape[0]))

    print('Getting training sentences...')
    stdout.flush()
    training_graphs = data_set.parsed_sents('train.conll')
    metadata_path = Path(data_set.root.path) / 'meta.pkl'
    trn_sent, trn_ex = load_metadata(metadata_path)
    tag_set, deprel_set = set(), set()
    with tqdm(total=trn_sent or None, leave=False, unit='sent') as progbar:
        for graph in training_graphs:
            for node in graph.nodes.values():
                if node['address']:  # not root
                    tag_set.add(node['ctag'])
                    deprel_set.add(node['rel'])
            progbar.update()
    tag_list = sorted(tag_set)
    if las:
        deprel_list = sorted(deprel_set)
        status = 'There are {} tags and {} deprel labels'
        status = status.format(len(tag_list), len(deprel_list))
    else:
        deprel_list = []
        status = 'There are {} tags'
        status = status.format(len(tag_list))
    transducer = Transducer(word_list, tag_list, deprel_list)
    training_data = TrainingIterable(training_graphs, transducer, seed=seed,
                                     max_batch_size=max_batch_size, las=las,
                                     transition_cache=transition_cache,
                                     n_ex=trn_ex)
    # use training's rng to initialize null embedding
    word_embeddings[-1] = training_data.rng.uniform(-.01, .01, 50)
    print(status,
          'from {} training sentences.'.format(training_data.graphs_len))
    save_metadata(metadata_path, len(training_graphs), len(training_data))

    print('Getting dev sentences...')
    stdout.flush()
    dev_sentences = data_set.tagged_sents('dev.conll')
    dev_arcs = tuple(list(transducer.graph2arc(graph, include_deprel=las))
                     for graph in tqdm(data_set.parsed_sents('dev.conll'),
                                       leave=False, unit='sent')
                     )
    print('There are {} dev sentences.'.format(len(dev_arcs)))

    print('Getting test sentences...')
    stdout.flush()
    test_sentences = data_set.tagged_sents('test.conll')
    test_arcs = tuple(list(transducer.graph2arc(graph, include_deprel=las))
                      for graph in tqdm(data_set.parsed_sents('test.conll'),
                                        leave=False, unit='sent')
                      )
    print('There are {} test sentences.'.format(len(test_arcs)))
    return (transducer, word_embeddings, training_data, dev_sentences,
            dev_arcs, test_sentences, test_arcs)
Example No. 19
def load_and_preprocess_data(data_set=ud_english,
                             word_embedding_path='word2vec.pkl.gz',
                             las=True,
                             max_batch_size=2048,
                             transition_cache=None,
                             seed=1234):
    '''Get train/test data

    See TrainingIterable for description of args

    Returns:
         a tuple of
         - a Transducer object
         - a word embedding matrix
         - a training data iterable
         - an iterable over dev sentences
         - an iterable over dev dependencies (arcs)
         - an iterable over test sentences
         - an iterable over test dependencies (arcs)
    '''
    print('Loading word embeddings...', end='')
    stdout.flush()
    if word_embedding_path.endswith('.gz'):
        with gz_open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    else:
        with open(word_embedding_path, 'rb') as file_obj:
            word_list, word_embeddings = load(file_obj)
    # add null embedding (we'll initialize later)
    word_embeddings = np.append(word_embeddings,
                                np.empty((1, word_embeddings.shape[1]),
                                         dtype=np.float32),
                                axis=0)
    print('there are {} word embeddings.'.format(word_embeddings.shape[0]))
    print('Determining POS tags...', end='')
    stdout.flush()
    tag_set = set()
    for sentence in data_set.tagged_sents('train.conll'):
        for _, tag in sentence:
            tag_set.add(tag)
    tag_list = sorted(tag_set)
    print('there are {} tags.'.format(len(tag_list)))
    training_graphs = data_set.parsed_sents('train.conll')
    if las:
        print('Determining deprel labels...', end='')
        stdout.flush()
        deprel_set = set()
        for graph in training_graphs:
            for node in graph.nodes.values():
                if node['address']:  # not root
                    deprel_set.add(node['rel'])
        deprel_list = sorted(deprel_set)
        print('there are {} deprel labels.'.format(len(deprel_list)))
    else:
        deprel_list = []
    transducer = Transducer(word_list, tag_list, deprel_list)
    print('Getting training data...', end='')
    stdout.flush()
    training_data = TrainingIterable(
        training_graphs,
        transducer,
        max_batch_size=max_batch_size,
        las=las,
        transition_cache=transition_cache,
        seed=seed,
    )
    # use training's rng to initialize null embedding
    word_embeddings[-1] = training_data.rng.uniform(-.01, .01, 50)
    print('there are {} samples.'.format(len(training_data)))
    print('Getting dev data...', end='')
    stdout.flush()
    dev_sentences = data_set.tagged_sents('dev.conll')
    dev_arcs = tuple(
        list(transducer.graph2arc(graph, include_deprel=las))
        for graph in data_set.parsed_sents('dev.conll'))
    print('there are {} samples.'.format(len(dev_arcs)))
    print('Getting test data...', end='')
    stdout.flush()
    test_sentences = data_set.tagged_sents('test.conll')
    test_arcs = tuple(
        list(transducer.graph2arc(graph, include_deprel=las))
        for graph in data_set.parsed_sents('test.conll'))
    print('there are {} samples.'.format(len(test_arcs)))
    return (transducer, word_embeddings, training_data, dev_sentences,
            dev_arcs, test_sentences, test_arcs)
Example No. 20
def extract_gz_to_string(filename: str) -> str:
    # Use a context manager so the file handle is closed after reading
    with gz_open(filename, mode='rb') as gz:
        return gz.read().decode()
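
A minimal round-trip sketch for the helper above (hypothetical file name, same gz_open alias):

with gz_open('note.txt.gz', 'wt') as f:
    f.write('hello gzip')

print(extract_gz_to_string('note.txt.gz'))  # -> hello gzip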
Example No. 21
def deserialize(filename):
    with gz_open(filename, 'rb') as f:
        return load(f)
Example No. 22
def serialize(obj, filename):
    with gz_open(filename, 'wb') as f:
        dump(obj, f, HIGHEST_PROTOCOL)
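
Together with the deserialize helper above, a round-trip usage sketch (hypothetical file name; load, dump, and HIGHEST_PROTOCOL are assumed to come from pickle) looks like:

data = {'users': [1, 2, 3], 'active': True}
serialize(data, 'data.pkl.gz')
assert deserialize('data.pkl.gz') == data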