Esempio n. 1
0
 def testOpenDel(self):
     self.createTempFile()
     for i in range(10000):
         o = BZ2File(self.filename)
         del o
Esempio n. 2
0
 def testSeekBackwardsFromEnd(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(-150, 2)
         self.assertEqual(bz2f.read(), self.TEXT[len(self.TEXT) - 150:])
Esempio n. 3
0
 def testSeekPostEndMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(150000)
         self.assertEqual(bz2f.tell(), len(self.TEXT) * 5)
         self.assertEqual(bz2f.read(), b"")
Esempio n. 4
0
 def testWriteNonDefaultCompressLevel(self):
     expected = bz2.compress(self.TEXT, compresslevel=5)
     with BZ2File(self.filename, "w", compresslevel=5) as bz2f:
         bz2f.write(self.TEXT)
     with open(self.filename, "rb") as f:
         self.assertEqual(f.read(), expected)
Esempio n. 5
0
 def testSeekForwardAcrossStreams(self):
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.seek)
         bz2f.seek(len(self.TEXT) + 150)
         self.assertEqual(bz2f.read(), self.TEXT[150:])
Esempio n. 6
0
 def testReadLinesMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.readlines, None)
         self.assertEqual(bz2f.readlines(), self.TEXT_LINES * 5)
Esempio n. 7
0
 def testIteratorMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(list(iter(bz2f)), self.TEXT_LINES * 5)
Esempio n. 8
0
File: web.py Progetto: dipsec/viper
def add_file():
    """Handle the upload form: unpack the uploaded files and store them.

    Reads these form fields from ``request``:

    * ``file``        -- one or more uploaded files.
    * ``tag_list``    -- tags passed to ``Database.add`` for every sample.
    * ``project``     -- project to open; anything not in ``project_list()``
                         falls back to ``'../'`` and is reported as 'Main'.
    * ``compression`` -- 'zip', 'gz', 'bz2', 'tar' or 'none'; any other
                         value means nothing is stored for the upload.
    * ``zip_pass``    -- password handed to ``ZipFile.extractall``.

    Every upload is written into a temporary directory, unpacked according
    to ``compression``, and each resulting file is handed to
    ``store_sample`` and then to ``Database.add``.

    Returns the rendered ``error.tpl`` template when unpacking or the
    database insert fails; otherwise finishes by calling ``redirect`` to
    the project page.
    """
    tags = request.forms.get('tag_list')
    uploads = request.files.getlist('file')
    # The compression type is a single form field shared by every upload.
    compression = request.forms.get('compression')

    # Set Project
    project = request.forms.get('project')
    if project in project_list():
        __project__.open(project)
    else:
        __project__.open('../')
        project = 'Main'
    db = Database()
    file_list = []
    # Paths of the raw uploads, so archives are never stored as samples
    # when a later archive's directory walk comes across them.
    uploaded_paths = set()
    # Write temp file to disk
    with upload_temp() as temp_dir:
        for upload in uploads:
            file_path = os.path.join(temp_dir, upload.filename)
            uploaded_paths.add(file_path)
            # Uploads are arbitrary binary data: text mode would corrupt
            # them on platforms that translate newlines (and fails outright
            # on Python 3).
            with open(file_path, 'wb') as tmp_file:
                tmp_file.write(upload.file.read())
            # Zip Files
            if compression == 'zip':
                zip_pass = request.forms.get('zip_pass')
                try:
                    with ZipFile(file_path) as zf:
                        zf.extractall(temp_dir, pwd=zip_pass)
                    _collect_extracted(temp_dir, upload.filename,
                                       uploaded_paths, file_list)
                except Exception as e:
                    return template('error.tpl',
                                    error="Error with zipfile - {0}".format(e))
            # GZip Files
            elif compression == 'gz':
                try:
                    file_list.append(
                        _decompress_upload(GzipFile, file_path, '.gz'))
                except Exception as e:
                    return template(
                        'error.tpl',
                        error="Error with gzipfile - {0}".format(e))
            # BZip2 Files
            elif compression == 'bz2':
                try:
                    file_list.append(
                        _decompress_upload(BZ2File, file_path, '.bz2'))
                except Exception as e:
                    return template(
                        'error.tpl',
                        error="Error with bzip2file - {0}".format(e))
            # Tar Files (any, including tar.gz tar.bz2)
            elif compression == 'tar':
                try:
                    if not tarfile.is_tarfile(file_path):
                        return template('error.tpl',
                                        error="This is not a tar file")
                    # NOTE(review): the archive is user-supplied and
                    # extractall() may honour '../' or absolute member
                    # names depending on the Python version -- consider
                    # validating member paths before extracting.
                    with tarfile.open(file_path, 'r:*') as tarf:
                        tarf.extractall(temp_dir)
                    _collect_extracted(temp_dir, upload.filename,
                                       uploaded_paths, file_list)
                except Exception as e:
                    return template('error.tpl',
                                    error="Error with tarfile - {0}".format(e))
            # Non zip files
            elif compression == 'none':
                file_list.append(file_path)

        # Add each file
        for new_file in file_list:
            # Parenthesised form prints identically on Python 2 and 3.
            print(new_file)
            obj = File(new_file)
            new_path = store_sample(obj)
            # A falsy path from store_sample skips the database insert.
            if new_path:
                # Add file to the database.
                if not db.add(obj=obj, tags=tags):
                    # Name the file that actually failed rather than
                    # whichever upload happened to be processed last.
                    return template(
                        'error.tpl',
                        error="Unable to Store The File: {0}".format(
                            os.path.basename(new_file)))
    redirect("/project/{0}".format(project))


def _strip_suffix(path, suffix):
    """Return *path* without a trailing *suffix* (case-insensitive).

    The path is returned unchanged when it does not end with the suffix or
    when stripping it would leave an empty file name.
    """
    stem = os.path.basename(path)[:-len(suffix)]
    if path.lower().endswith(suffix) and stem:
        return path[:-len(suffix)]
    return path


def _decompress_upload(opener, file_path, suffix):
    """Decompress *file_path* with *opener* and return the output path.

    *opener* is called as ``opener(file_path, 'rb')`` and must return a
    context manager with ``read()`` (``GzipFile`` or ``BZ2File``).  The
    output is written next to the input with *suffix* removed; when the
    name carries no such suffix the decompressed data replaces the
    compressed temporary copy, which has been fully read by then.
    """
    with opener(file_path, 'rb') as compressed:
        data = compressed.read()
    out_path = _strip_suffix(file_path, suffix)
    with open(out_path, 'wb') as df:
        df.write(data)
    return out_path


def _collect_extracted(temp_dir, archive_name, uploaded_paths, file_list):
    """Append files found under *temp_dir* to *file_list* (in place).

    Skips files named like the archive itself, the raw uploads, and paths
    already listed, so that unpacking several archives in one request
    neither duplicates earlier results nor stores the archives themselves.
    """
    known = set(file_list)
    for root, _dirs, files in os.walk(temp_dir, topdown=False):
        for name in files:
            path = os.path.join(root, name)
            if name == archive_name or path in uploaded_paths:
                continue
            if path in known:
                continue
            known.add(path)
            file_list.append(path)
Esempio n. 9
0
 def testIterator(self):
     # "Test iter(BZ2File)"
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         sio = StringIO(self.TEXT)
         self.assertEqual(list(iter(bz2f)), sio.readlines())
Esempio n. 10
0
 def testSeekForwardBytesIO(self):
     with BytesIO(self.DATA) as bio:
         with BZ2File(bio) as bz2f:
             self.assertRaises(TypeError, bz2f.seek)
             bz2f.seek(150)
             self.assertEqual(bz2f.read(), self.TEXT[150:])
Esempio n. 11
0
 def testSeekBackwardsBytesIO(self):
     with BytesIO(self.DATA) as bio:
         with BZ2File(bio) as bz2f:
             bz2f.read(500)
             bz2f.seek(-150, 1)
             self.assertEqual(bz2f.read(), self.TEXT[500 - 150:])
Esempio n. 12
0
 def testReadBytesIO(self):
     with BytesIO(self.DATA) as bio:
         with BZ2File(bio) as bz2f:
             self.assertRaises(TypeError, bz2f.read, float())
             self.assertEqual(bz2f.read(), self.TEXT)
         self.assertFalse(bio.closed)
Esempio n. 13
0
 def testOpenPathLikeFilename(self):
     filename = pathlib.Path(self.filename)
     with BZ2File(filename, "wb") as f:
         f.write(self.DATA)
     with BZ2File(filename, "rb") as f:
         self.assertEqual(f.read(), self.DATA)
Esempio n. 14
0
def main():
    """Compute n-gram term frequencies for each bz2 comment file.

    Command-line options:
      --out_dir        directory receiving the .tsv outputs
      --comment_files  input files; when omitted, the result of
                       get_all_comment_files() is used with '.bz2'
                       replaced by '_clean_normalized.bz2'
      --n              n-gram order
      --file_suffix    optional suffix appended to each output file name
      --sample_pct     forwarded to get_ngram_counts

    One tab-separated file is written per input file, named after the
    first '201X-N' date found in the input file name.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--out_dir', default='../../data/frequency')
    cli.add_argument('--comment_files', nargs='+', default=None)
    cli.add_argument('--n', type=int, default=2)
    cli.add_argument('--file_suffix', default=None)
    cli.add_argument('--sample_pct', type=float, default=100)
    args = cli.parse_args()

    n = args.n
    comment_files = args.comment_files
    if comment_files is None:
        # replace with clean normalized (smaller vocab)
        comment_files = [
            path.replace('.bz2', '_clean_normalized.bz2')
            for path in get_all_comment_files()
        ]

    # Keep every n-gram seen at least once.
    min_tf = 1
    tokenizer = WhitespaceTokenizer()
    # Counts are computed directly rather than via CountVectorizer: the
    # original author notes that approach ran out of memory and that
    # cooccurrence is not needed here.
    date_format = '201[0-9]-[0-9]+'
    for f in comment_files:
        print('processing file %s' % (f))
        # The first date-like token in the file name labels the output.
        date_str = re.findall(date_format, f)[0]
        print('computing ngram = %d' % (n))
        with BZ2File(f, 'r') as comment_file:
            counts = get_ngram_counts(make_iter(comment_file),
                                      n,
                                      tokenizer=tokenizer,
                                      sample_pct=args.sample_pct)

            # limit min_frequency
            counts = counts[counts >= min_tf]
            counts.columns = [date_str]
            # write to file
            if args.file_suffix is None:
                basename = '%s_%dgram_tf.tsv' % (date_str, n)
            else:
                basename = '%s_%dgram_tf_%s.tsv' % (date_str, n,
                                                    args.file_suffix)
            counts.to_csv(os.path.join(args.out_dir, basename), sep='\t')
Esempio n. 15
0
 def testRead100(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(bz2f.read(100), self.TEXT[:100])
Esempio n. 16
0
 def testOpenDel(self):
     # "Test opening and deleting a file many times"
     self.createTempFile()
     for i in xrange(10000):
         o = BZ2File(self.filename)
         del o
Esempio n. 17
0
 def testReadLine(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.readline, None)
         for line in self.TEXT_LINES:
             self.assertEqual(bz2f.readline(), line)
Esempio n. 18
0
 def testRead(self):
     # "Test BZ2File.read()"
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, None)
         self.assertEqual(bz2f.read(), self.TEXT)
Esempio n. 19
0
 def testIterator(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(list(iter(bz2f)), self.TEXT_LINES)
Esempio n. 20
0
 def testRead0(self):
     # Test BBZ2File.read(0)"
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, None)
         self.assertEqual(bz2f.read(0), "")
Esempio n. 21
0
 def testWrite(self):
     with BZ2File(self.filename, "w") as bz2f:
         self.assertRaises(TypeError, bz2f.write)
         bz2f.write(self.TEXT)
     with open(self.filename, 'rb') as f:
         self.assertEqual(ext_decompress(f.read()), self.TEXT)
Esempio n. 22
0
 def testReadBadFile(self):
     self.createTempFile(streams=0, suffix=self.BAD_DATA)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(OSError, bz2f.read)
Esempio n. 23
0
 def testSeekForward(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.seek)
         bz2f.seek(150)
         self.assertEqual(bz2f.read(), self.TEXT[150:])
Esempio n. 24
0
 def testReadMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, float())
         self.assertEqual(bz2f.read(), self.TEXT * 5)
Esempio n. 25
0
 def testSeekBackwards(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         bz2f.read(500)
         bz2f.seek(-150, 1)
         self.assertEqual(bz2f.read(), self.TEXT[500 - 150:])
Esempio n. 26
0
 def testReadMultiStreamTrailingJunk(self):
     self.createTempFile(streams=5, suffix=self.BAD_DATA)
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(bz2f.read(), self.TEXT * 5)
Esempio n. 27
0
 def testSeekBackwardsFromEndAcrossStreams(self):
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(-1000, 2)
         self.assertEqual(bz2f.read(), (self.TEXT * 2)[-1000:])
Esempio n. 28
0
 def testRead0(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, float())
         self.assertEqual(bz2f.read(0), b"")
Esempio n. 29
0
 def testSeekPreStartMultiStream(self):
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(-150)
         self.assertEqual(bz2f.tell(), 0)
         self.assertEqual(bz2f.read(), self.TEXT * 2)
Esempio n. 30
0
def demux_tar(filename, options):
    """Extract analysable files from a tarball or gzip/bzip2 file.

    Only files with no extension or one of .tar, .gz, .tgz, .bz2, .tbz2,
    .bin are considered.  The file is first opened as a tar archive;
    regular members between 1 byte and 100 MiB whose lower-cased extension
    is in ``demux_extensions_list`` are written into a fresh temporary
    directory.  If tar handling fails and the extension is not one of
    .tgz/.tbz2/.tar, the file is instead decompressed as a single gzip
    stream and then as a single bzip2 stream, keeping whichever works.

    Args:
        filename: path of the file to demultiplex.
        options: unused; kept so existing callers keep working (settings
            are read from a fresh ``Config()``).

    Returns:
        List of paths of the extracted files; empty when nothing could be
        extracted.  Errors are swallowed on purpose (best effort).
    """
    retlist = []
    # Kept separate from the per-member extensions inspected below, so the
    # fallback decision in the handler always looks at the archive itself.
    file_ext = ""

    try:
        # only extract from files with no extension or with .bin
        # (downloaded from us) or .tar/tarball extensions
        file_ext = os.path.splitext(filename)[1]
        if file_ext not in ("", ".tar", ".gz", ".tgz", ".bz2", ".tbz2",
                            ".bin"):
            return retlist

        with tarfile.open(filename, "r") as archive:
            extracted = [info for info in archive.getmembers()
                         if _demux_tar_wanted(info)]

            if extracted:
                tmp_dir = _demux_tmp_dir()
                for member in extracted:
                    # Member names come from an untrusted archive: keep
                    # only the final path component so '../' or absolute
                    # names cannot escape tmp_dir and nested names do not
                    # need directories that were never created.
                    safe_name = os.path.basename(member.name)
                    outpath = os.path.join(tmp_dir, safe_name)
                    # Flattening can make two members collide; the first
                    # one wins rather than being silently overwritten.
                    if not safe_name or outpath in retlist:
                        continue
                    fobj = archive.extractfile(member)
                    try:
                        with open(outpath, "wb") as outfile:
                            outfile.write(fobj.read())
                    finally:
                        fobj.close()
                    retlist.append(outpath)
    except Exception:
        # Real tarball extensions get no single-stream fallback.
        if file_ext in (".tgz", ".tbz2", ".tar"):
            return retlist
        # handle gzip, then bzip2
        for opener in (gzip.open, BZ2File):
            outpath = _demux_decompress(filename, opener)
            if outpath:
                retlist.append(outpath)

    return retlist


def _demux_tar_wanted(info):
    """Return True if tar member *info* should be extracted."""
    # avoid obvious bombs (and empty members)
    if not info.size or info.size > 100 * 1024 * 1024:
        return False
    # ignore non-regular files
    if not info.isreg():
        return False
    member_ext = os.path.splitext(info.name)[1].lower()
    # skip extension-less dotfiles
    if member_ext == "" and os.path.basename(info.name).startswith("."):
        return False
    return member_ext in demux_extensions_list


def _demux_tmp_dir():
    """Create and return a fresh 'cuckootar_' directory under tmppath."""
    tmp_path = Config().cuckoo.get("tmppath", "/tmp")
    target_path = os.path.join(tmp_path, "cuckoo-tar-tmp")
    try:
        os.mkdir(target_path)
    except OSError:
        # Already existing (possibly created concurrently) is fine.
        if not os.path.isdir(target_path):
            raise
    return tempfile.mkdtemp(prefix='cuckootar_', dir=target_path)


def _demux_decompress(filename, opener):
    """Decompress *filename* via *opener*; return the output path or None.

    The data is decompressed before any directory is created, so a file
    that is not in the expected format leaves nothing behind on disk.
    """
    try:
        with opener(filename, "rb") as fobj:
            data = fobj.read()
        final_name = os.path.basename(os.path.splitext(filename)[0])
        outpath = os.path.join(_demux_tmp_dir(), final_name)
        with open(outpath, "wb") as outfile:
            outfile.write(data)
    except Exception:
        # Best effort: the file simply is not in this format.
        return None
    return outpath