def testOpenDel(self):
    """Repeatedly open and discard a BZ2File to exercise cleanup paths."""
    self.createTempFile()
    for _ in range(10000):
        handle = BZ2File(self.filename)
        del handle
def testSeekBackwardsFromEnd(self):
    """Seeking relative to EOF (whence=2) exposes only the final bytes."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        f.seek(-150, 2)
        tail = self.TEXT[len(self.TEXT) - 150:]
        self.assertEqual(f.read(), tail)
def testSeekPostEndMultiStream(self):
    """Seeking past the end of a multi-stream file lands exactly at EOF."""
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as f:
        f.seek(150000)
        self.assertEqual(f.tell(), len(self.TEXT) * 5)
        self.assertEqual(f.read(), b"")
def testWriteNonDefaultCompressLevel(self):
    """Writing at compresslevel=5 produces the same bytes as bz2.compress."""
    expected = bz2.compress(self.TEXT, compresslevel=5)
    with BZ2File(self.filename, "w", compresslevel=5) as writer:
        writer.write(self.TEXT)
    with open(self.filename, "rb") as raw:
        self.assertEqual(raw.read(), expected)
def testSeekForwardAcrossStreams(self):
    """A forward seek may cross a stream boundary transparently."""
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as f:
        # seek() requires an argument
        self.assertRaises(TypeError, f.seek)
        f.seek(len(self.TEXT) + 150)
        self.assertEqual(f.read(), self.TEXT[150:])
def testReadLinesMultiStream(self):
    """readlines() returns the lines of every concatenated stream."""
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as f:
        self.assertRaises(TypeError, f.readlines, None)
        self.assertEqual(f.readlines(), self.TEXT_LINES * 5)
def testIteratorMultiStream(self):
    """Iterating a multi-stream file yields the lines of all streams."""
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as f:
        lines = list(iter(f))
        self.assertEqual(lines, self.TEXT_LINES * 5)
def add_file():
    """Handle the upload form POST.

    Writes each uploaded file into a temp directory, optionally unpacks
    it (zip / gz / bz2 / tar as selected by the 'compression' form
    field), stores every resulting file via store_sample(), registers it
    in the database with the submitted tags, and finally redirects to
    the project page.  On any unpack/store error an error template is
    returned instead.
    """
    tags = request.forms.get('tag_list')
    uploads = request.files.getlist('file')
    # Set Project
    project = request.forms.get('project')
    if project in project_list():
        __project__.open(project)
    else:
        # Unknown project: fall back to the main workspace.
        __project__.open('../')
        project = 'Main'
    db = Database()
    file_list = []
    # The form value cannot change mid-request; read it once.
    compression = request.forms.get('compression')
    # Write temp file to disk
    with upload_temp() as temp_dir:
        for upload in uploads:
            file_path = os.path.join(temp_dir, upload.filename)
            # BUGFIX: uploads are arbitrary binary data -- must be
            # written in binary mode ('wb'), not text mode ('w').
            with open(file_path, 'wb') as tmp_file:
                tmp_file.write(upload.file.read())
            # Zip Files
            if compression == 'zip':
                zip_pass = request.forms.get('zip_pass')
                try:
                    with ZipFile(file_path) as zf:
                        zf.extractall(temp_dir, pwd=zip_pass)
                    for root, dirs, files in os.walk(temp_dir, topdown=False):
                        for name in files:
                            if not name == upload.filename:
                                file_list.append(os.path.join(root, name))
                except Exception as e:
                    return template('error.tpl', error="Error with zipfile - {0}".format(e))
            # GZip Files
            elif compression == 'gz':
                try:
                    # BUGFIX: close the GzipFile deterministically even if
                    # read() raises, via a context manager.
                    with GzipFile(file_path, 'rb') as gzf:
                        decompress = gzf.read()
                    with open(file_path[:-3], "wb") as df:
                        df.write(decompress)
                    file_list.append(file_path[:-3])
                except Exception as e:
                    return template(
                        'error.tpl', error="Error with gzipfile - {0}".format(e))
            # BZip2 Files
            elif compression == 'bz2':
                try:
                    # BUGFIX: same deterministic-close fix as for gzip.
                    with BZ2File(file_path, 'rb') as bz2f:
                        decompress = bz2f.read()
                    with open(file_path[:-3], "wb") as df:
                        df.write(decompress)
                    file_list.append(file_path[:-3])
                except Exception as e:
                    return template(
                        'error.tpl', error="Error with bzip2file - {0}".format(e))
            # Tar Files (any, including tar.gz tar.bz2)
            elif compression == 'tar':
                try:
                    if not tarfile.is_tarfile(file_path):
                        return template('error.tpl', error="This is not a tar file")
                    # NOTE(review): extractall() on an untrusted archive is
                    # vulnerable to path traversal via "../" member names;
                    # consider validating member paths before extracting.
                    with tarfile.open(file_path, 'r:*') as tarf:
                        tarf.extractall(temp_dir)
                    for root, dirs, files in os.walk(temp_dir, topdown=False):
                        for name in files:
                            if not name == upload.filename:
                                file_list.append(os.path.join(root, name))
                except Exception as e:
                    return template('error.tpl', error="Error with tarfile - {0}".format(e))
            # Non zip files
            elif compression == 'none':
                file_list.append(file_path)
        # Add each file
        for new_file in file_list:
            # BUGFIX: print as a function (Python 3 compatible, same
            # output for a single argument on Python 2).
            print(new_file)
            obj = File(new_file)
            new_path = store_sample(obj)
            success = True
            if new_path:
                # Add file to the database.
                success = db.add(obj=obj, tags=tags)
            if not success:
                return template(
                    'error.tpl',
                    error="Unable to Store The File: {0}".format(
                        upload.filename))
    redirect("/project/{0}".format(project))
def testIterator(self):
    """iter(BZ2File) yields the same lines a StringIO over the text would."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        reference = StringIO(self.TEXT)
        self.assertEqual(list(iter(f)), reference.readlines())
def testSeekForwardBytesIO(self):
    """Forward seek works when the underlying object is a BytesIO."""
    with BytesIO(self.DATA) as source:
        with BZ2File(source) as f:
            # seek() requires an argument
            self.assertRaises(TypeError, f.seek)
            f.seek(150)
            self.assertEqual(f.read(), self.TEXT[150:])
def testSeekBackwardsBytesIO(self):
    """Relative backward seek works over a BytesIO source."""
    with BytesIO(self.DATA) as source:
        with BZ2File(source) as f:
            f.read(500)
            f.seek(-150, 1)
            self.assertEqual(f.read(), self.TEXT[500 - 150:])
def testReadBytesIO(self):
    """Reading via a BytesIO-backed BZ2File leaves the source open on exit."""
    with BytesIO(self.DATA) as source:
        with BZ2File(source) as f:
            self.assertRaises(TypeError, f.read, float())
            self.assertEqual(f.read(), self.TEXT)
        # Closing the BZ2File must not close the underlying object.
        self.assertFalse(source.closed)
def testOpenPathLikeFilename(self):
    """BZ2File accepts an os.PathLike filename for writing and reading."""
    path = pathlib.Path(self.filename)
    with BZ2File(path, "wb") as writer:
        writer.write(self.DATA)
    with BZ2File(path, "rb") as reader:
        self.assertEqual(reader.read(), self.DATA)
def main():
    """Count n-gram term frequencies in bz2 comment dumps, one TSV per file.

    For each (clean-normalized) comment dump, tokenizes on whitespace,
    counts n-grams of size --n over a --sample_pct sample, and writes
    the per-date counts to --out_dir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', default='../../data/frequency')
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--n', type=int, default=2)
    parser.add_argument('--file_suffix', default=None)
    parser.add_argument('--sample_pct', type=float, default=100)
    args = parser.parse_args()

    comment_files = args.comment_files
    if comment_files is None:
        comment_files = get_all_comment_files()
    # Prefer the cleaned/normalized dumps (smaller vocabulary).
    comment_files = [fname.replace('.bz2', '_clean_normalized.bz2')
                     for fname in comment_files]

    # Keep every n-gram seen at least once.
    min_tf = 1
    tokenizer = WhitespaceTokenizer()
    # Dumps are named by month, e.g. "2014-07".
    date_format = '201[0-9]-[0-9]+'
    for comment_fname in comment_files:
        print('processing file %s' % (comment_fname))
        date_str = re.findall(date_format, comment_fname)[0]
        print('computing ngram = %d' % (args.n))
        with BZ2File(comment_fname, 'r') as comment_file:
            # Stream the comments; a full document-term matrix would not
            # fit in memory, so only raw counts are computed.
            comment_iter = make_iter(comment_file)
            counts = get_ngram_counts(comment_iter, args.n,
                                      tokenizer=tokenizer,
                                      sample_pct=args.sample_pct)
            counts = counts[counts >= min_tf]
            counts.columns = [date_str]
            if args.file_suffix is not None:
                out_fname = os.path.join(
                    args.out_dir,
                    '%s_%dgram_tf_%s.tsv' % (date_str, args.n,
                                             args.file_suffix))
            else:
                out_fname = os.path.join(
                    args.out_dir, '%s_%dgram_tf.tsv' % (date_str, args.n))
            counts.to_csv(out_fname, sep='\t')
def testRead100(self):
    """read(100) returns exactly the first 100 bytes of the text."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        self.assertEqual(f.read(100), self.TEXT[:100])
def testOpenDel(self):
    """Open and delete a BZ2File many times (Python 2 variant)."""
    self.createTempFile()
    for _ in xrange(10000):
        handle = BZ2File(self.filename)
        del handle
def testReadLine(self):
    """readline() returns the known text one line at a time."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        self.assertRaises(TypeError, f.readline, None)
        for expected in self.TEXT_LINES:
            self.assertEqual(f.readline(), expected)
def testRead(self):
    """read() with no size argument returns the full decompressed text."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        self.assertRaises(TypeError, f.read, None)
        self.assertEqual(f.read(), self.TEXT)
def testIterator(self):
    """Line iteration over a BZ2File matches the known line list."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        lines = list(iter(f))
        self.assertEqual(lines, self.TEXT_LINES)
def testRead0(self):
    # Test BZ2File.read(0)
    self.createTempFile()
    with BZ2File(self.filename) as f:
        self.assertRaises(TypeError, f.read, None)
        self.assertEqual(f.read(0), "")
def testWrite(self):
    """Data written through BZ2File decompresses back to the text."""
    with BZ2File(self.filename, "w") as writer:
        # write() requires an argument
        self.assertRaises(TypeError, writer.write)
        writer.write(self.TEXT)
    with open(self.filename, 'rb') as raw:
        self.assertEqual(ext_decompress(raw.read()), self.TEXT)
def testReadBadFile(self):
    """Reading a file containing only junk data raises OSError."""
    self.createTempFile(streams=0, suffix=self.BAD_DATA)
    with BZ2File(self.filename) as f:
        self.assertRaises(OSError, f.read)
def testSeekForward(self):
    """An absolute forward seek skips the leading bytes."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        # seek() requires an argument
        self.assertRaises(TypeError, f.seek)
        f.seek(150)
        self.assertEqual(f.read(), self.TEXT[150:])
def testReadMultiStream(self):
    """read() concatenates the payload of every stream in the file."""
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as f:
        self.assertRaises(TypeError, f.read, float())
        self.assertEqual(f.read(), self.TEXT * 5)
def testSeekBackwards(self):
    """A negative relative seek rewinds within already-read data."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        f.read(500)
        f.seek(-150, 1)
        self.assertEqual(f.read(), self.TEXT[500 - 150:])
def testReadMultiStreamTrailingJunk(self):
    """Trailing garbage after the last stream is ignored by read()."""
    self.createTempFile(streams=5, suffix=self.BAD_DATA)
    with BZ2File(self.filename) as f:
        self.assertEqual(f.read(), self.TEXT * 5)
def testSeekBackwardsFromEndAcrossStreams(self):
    """Seeking back from EOF can land inside an earlier stream."""
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as f:
        f.seek(-1000, 2)
        self.assertEqual(f.read(), (self.TEXT * 2)[-1000:])
def testRead0(self):
    """read(0) returns empty bytes without consuming any data."""
    self.createTempFile()
    with BZ2File(self.filename) as f:
        self.assertRaises(TypeError, f.read, float())
        self.assertEqual(f.read(0), b"")
def testSeekPreStartMultiStream(self):
    """Seeking before position 0 clamps to the start of the file."""
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as f:
        f.seek(-150)
        self.assertEqual(f.tell(), 0)
        self.assertEqual(f.read(), self.TEXT * 2)
def demux_tar(filename, options):
    """Extract interesting members of a tar/gz/bz2 archive into a temp dir.

    Returns a list of paths to the files extracted under a fresh
    directory below <tmppath>/cuckoo-tar-tmp; empty if *filename* has an
    unexpected extension or contains nothing worth extracting.  If tar
    parsing fails and the extension is not an explicit tar one
    (.tar/.tgz/.tbz2), falls back to treating the file as a single
    gzip- then bzip2-compressed blob (best effort, errors ignored).
    """
    retlist = []
    ext = ""
    try:
        # only extract from files with no extension or with .bin
        # (downloaded from us) or .tar/tarball extensions
        ext = os.path.splitext(filename)[1]
        if ext not in ("", ".tar", ".gz", ".tgz", ".bz2", ".tbz2", ".bin"):
            return retlist
        extracted = []
        with tarfile.open(filename, "r") as archive:
            for info in archive.getmembers():
                # avoid obvious bombs
                if info.size > 100 * 1024 * 1024 or not info.size:
                    continue
                # ignore non-regular files
                if not info.isreg():
                    continue
                base, ext = os.path.splitext(info.name)
                basename = os.path.basename(info.name)
                ext = ext.lower()
                # skip extensionless dotfiles
                if ext == "" and len(basename) and basename[0] == ".":
                    continue
                if ext in demux_extensions_list:
                    extracted.append(info)
            if extracted:
                # Note: the options parameter is deliberately rebound to a
                # fresh Config, preserving the original behavior.
                options = Config()
                tmp_path = options.cuckoo.get("tmppath", "/tmp")
                target_path = os.path.join(tmp_path, "cuckoo-tar-tmp")
                if not os.path.exists(target_path):
                    os.mkdir(target_path)
                tmp_dir = tempfile.mkdtemp(prefix='cuckootar_', dir=target_path)
                for extfile in extracted:
                    # NOTE(review): extfile.name comes from an untrusted
                    # archive; a member name containing ".." could escape
                    # tmp_dir -- consider sanitizing before joining.
                    outpath = os.path.join(tmp_dir, extfile.name)
                    fobj = archive.extractfile(extfile)
                    try:
                        # BUGFIX: close the output file even if the write
                        # raises (previously leaked on error).
                        with open(outpath, "wb") as outfile:
                            outfile.write(fobj.read())
                    finally:
                        fobj.close()
                    retlist.append(outpath)
    # BUGFIX: narrowed from bare "except:", which also swallowed
    # SystemExit/KeyboardInterrupt.
    except Exception:
        # Explicit tar extensions that fail to parse are given up on.
        if ext in (".tgz", ".tbz2", ".tar"):
            return retlist
        # handle gzip
        try:
            gzfinal = os.path.basename(os.path.splitext(filename)[0])
            with gzip.open(filename, "rb") as fobj:
                options = Config()
                tmp_path = options.cuckoo.get("tmppath", "/tmp")
                target_path = os.path.join(tmp_path, "cuckoo-tar-tmp")
                if not os.path.exists(target_path):
                    os.mkdir(target_path)
                tmp_dir = tempfile.mkdtemp(prefix='cuckootar_', dir=target_path)
                outpath = os.path.join(tmp_dir, gzfinal)
                with open(outpath, "wb") as outfile:
                    outfile.write(fobj.read())
                retlist.append(outpath)
        except Exception:
            # best effort: not gzip (or unreadable); try bzip2 next
            pass
        # handle bzip2
        try:
            gzfinal = os.path.basename(os.path.splitext(filename)[0])
            with BZ2File(filename, "rb") as fobj:
                options = Config()
                tmp_path = options.cuckoo.get("tmppath", "/tmp")
                target_path = os.path.join(tmp_path, "cuckoo-tar-tmp")
                if not os.path.exists(target_path):
                    os.mkdir(target_path)
                tmp_dir = tempfile.mkdtemp(prefix='cuckootar_', dir=target_path)
                outpath = os.path.join(tmp_dir, gzfinal)
                with open(outpath, "wb") as outfile:
                    outfile.write(fobj.read())
                retlist.append(outpath)
        except Exception:
            # best effort: not bzip2 either; return whatever we have
            pass
    return retlist