def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None,
                         newline=None, compresslevel=9, format=None, check=-1,
                         preset=None, filters=None, compression=None):
    # pylint: disable=unexpected-keyword-arg,no-member
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    elif compression in ('gz', 'gzip'):
        if six.PY2:
            # Python 2 gzip.open does not accept the text-mode keywords.
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        else:
            return gzip.open(path, mode=mode, compresslevel=compresslevel,
                             errors=errors, newline=newline, encoding=encoding)
    elif compression in ('lzma', 'xz'):
        try:
            import lzma
        except ImportError:
            from backports import lzma
        return lzma.open(path, mode=mode, format=format, check=check,
                         preset=preset, filters=filters, encoding=encoding,
                         errors=errors, newline=newline)
    elif compression == 'bz2':
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    else:
        raise ValueError(
            'compression must be None, gz, gzip, bz2, lzma, or xz '
            'and was {0}'.format(compression))

def bz2(self):
    """bz2 type"""
    if self.format == "bz2":
        with bz2file.open(self.path, 'r') as F:
            for line in F:
                line = line.decode('utf-8')
                print(line)

def open_files():
    wiki_bz_file = bz2file.open(config.wiki_file, mode="rt", encoding='utf8')
    output_file = open(config.parsed_names_and_alternats_file, mode="w",
                       encoding='utf8')
    return wiki_bz_file, output_file

def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None,
                         newline=None, compresslevel=9, format=None, check=-1,
                         preset=None, filters=None, compression=None):
    # pylint: disable=unexpected-keyword-arg,no-member
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    elif compression == 'gz' or compression == 'gzip':
        if six.PY2:
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        else:
            return gzip.open(path, mode=mode, compresslevel=compresslevel,
                             errors=errors, newline=newline, encoding=encoding)
    elif compression == 'lzma' or compression == 'xz':
        try:
            import lzma
        except ImportError:
            from backports import lzma
        return lzma.open(path, mode=mode, format=format, check=check,
                         preset=preset, filters=filters, encoding=encoding,
                         errors=errors, newline=newline)
    elif compression == 'bz2':
        if six.PY2 or '__pypy__' in sys.builtin_module_names:
            import bz2file as bz2  # pylint: disable=import-error
        else:
            import bz2
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    else:
        raise ValueError(
            'compression must be None, gz, gzip, bz2, lzma, or xz '
            'and was {0}'.format(compression))

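# A minimal usage sketch for universal_write_open above. The output paths are
# hypothetical, and the calls assume the builtins/six/gzip/sys imports the
# surrounding module is expected to provide.
with universal_write_open('out.txt.gz', 'wt', encoding='utf-8',
                          compression='gzip') as handle:
    handle.write('hello\n')
with universal_write_open('out.txt.bz2', 'wt', encoding='utf-8',
                          compression='bz2') as handle:
    handle.write('hello\n')
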
def get_wiki2():
    reobj1 = re.compile(
        r"[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # strip vertical bars (|) before uploading to Aliyun
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return s

    wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                filelist.append(
                    reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                if i % 1000 == 0:
                    f.write("".join(filelist))
                    filelist = []
        if filelist:
            f.write("".join(filelist))

def get_file(self):
    """
    Downloads the resource if necessary and opens an uncompressed
    stream for reading.
    """
    local_filename = download.retrieve(self)
    return bz2.open(local_filename, mode='rt')

def preprocess_sentence():
    i = 0
    line = ''
    # Use gensim's extract_pages to pull out each page.
    wiki = extract_pages(
        bz2file.open('./zhwiki-20190301-pages-articles.xml.bz2'))
    with open('./zhwiki_sentence.txt', 'w') as f:
        for text in wiki:
            # Skip help pages and redirect pages.
            if not re.findall('^[a-zA-Z]+:', text[0]) and text[0] and not re.findall(
                    u'^#', text[1]):
                converted = opencc.convert(text[1]).strip()  # traditional -> simplified
                converted = re.sub(r'\|\w*\]', '', converted)
                for x in converted:
                    if len(x.encode('utf-8')) == 3 and x not in stop_punctuation(
                            './stop_punctuation.txt'):
                        line += x
                    # Break lines on common Chinese punctuation marks.
                    if x in ['\n', '。', '?', '!', ',', ';', ':'] and line != '\n':
                        f.write(line.strip() + '\n')  # write one corpus line
                        line = ''
                i += 1
                if i == 10:
                    print("Number of Chinese Wikipedia articles selected:", i)
                    break

def decomp_burrows_wheeler(filename):
    start_time = time.time()
    outfilename = filename + '.bz'
    # Both files are closed by the with statement.
    with open(filename, 'rb') as f, bz2file.open(outfilename, 'wb') as out:
        out.write(bz2.decompress(bytes(f.read())))

def tweet_num_bz2(file_list):
    import bz2file
    count = list()
    for file in file_list:
        fp = bz2file.open(file, 'r')
        count.append(len(fp.readlines()))
        fp.close()
    return count

def read(self):
    with bz2.open(self.path, mode=self.mode, compresslevel=self.compresslevel,
                  encoding=self.encoding, errors=self.errors,
                  newline=self.newline) as file_content:
        return file_content.read()

def _open_bz2(filename, mode):
    if bz2 is None:
        raise ImportError("Cannot open bz2 files: The bz2 module is not available")
    if _PY3:
        return bz2.open(filename, mode)
    else:
        if mode[0] == 'a':
            raise ValueError("Mode '{}' not supported with BZ2 compression".format(mode))
        return bz2.BZ2File(filename, mode)

def get_wiki():
    from opencc import OpenCC
    # See the annotated walkthrough of this code in the blog post:
    # https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    resub1 = re.compile(r':*{\|[\s\S]*?\|}')
    resub2 = re.compile(r'<gallery>[\s\S]*?</gallery>')
    resub3 = re.compile(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}')
    resub4 = re.compile(r'\* *\n|\'{2,}')
    resub5 = re.compile(r'\n+')
    resub6 = re.compile(r'\n[:;]|\n +')
    resub7 = re.compile(r'\n==')
    refind1 = re.compile('^[a-zA-Z]+:')
    refind2 = re.compile('^#')
    p1 = re.compile(r'-\{.*?(zh-hans|zh-cn):([^;]*?)(;.*?)?\}-')
    p2 = re.compile(r'[(\(][,;。?!\s]*[)\)]')
    p3 = re.compile(r'[「『]')
    p4 = re.compile(r'[」』]')

    def wiki_replace(s):
        s = filter_wiki(s)
        s = resub1.sub('', s)
        s = resub2.sub('', s)
        s = resub3.sub('\\1[[\\2]]', s)
        s = resub4.sub('', s)
        s = resub5.sub('\n', s)
        s = resub6.sub('\n', s)
        s = resub7.sub('\n\n==', s)
        s = p1.sub(r'\2', s)
        s = p2.sub(r'', s)
        s = p3.sub(r'“', s)
        s = p4.sub(r'”', s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))
    # wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
    with codecs.open('wiki.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not refind1.findall(d[0]) and d[0] and not refind2.findall(d[1]):
                filelist.append(d[0] + "\n" + d[1])
                i += 1
                if i % 100 == 0:
                    f.write(wiki_replace("\n\n".join(filelist)))
                    filelist = []
        # Flush the final partial batch.
        if filelist:
            f.write(wiki_replace("\n\n".join(filelist)))

def __iter__(self):
    with bz2.open(self.path, mode=self.mode, compresslevel=self.compresslevel,
                  encoding=self.encoding, errors=self.errors,
                  newline=self.newline) as file_content:
        for line in file_content:
            yield line

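# A minimal usage sketch for the read/__iter__ methods above, assuming they
# belong to a reader class (called BZ2Reader here purely for illustration)
# whose constructor stores path, mode, compresslevel, encoding, errors, and
# newline attributes:
reader = BZ2Reader('data.txt.bz2', mode='rt', compresslevel=9,
                   encoding='utf-8', errors=None, newline=None)
for line in reader:       # streams lines lazily via __iter__
    print(line, end='')
text = reader.read()      # or read the whole file in one call
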
def burrows_wheeler(filename, level):
    start_time = time.time()
    outfilename = filename + '.bz'
    try:
        with open(filename, 'rb') as f, bz2file.open(outfilename, 'wb', level) as out:
            out.write(bz2.compress(bytes(f.read())))
    finally:
        print("closing")
        comprsn_details(filename, outfilename, start_time)

def read(self):
    if six.PY2 or '__pypy__' in sys.builtin_module_names:
        import bz2file as bz2  # pylint: disable=import-error
    else:
        import bz2
    with bz2.open(self.path, mode=self.mode, compresslevel=self.compresslevel,
                  encoding=self.encoding, errors=self.errors,
                  newline=self.newline) as file_content:
        return file_content.read()

def main():
    if len(sys.argv) < 3:
        print("Use: python2.7 wiki.py input.xml output.txt")
        return
    handler = wikihandler()
    saxparser = make_parser()
    saxparser.setContentHandler(handler)
    # source = open(sys.argv[1], "r")
    source = bz2file.open("enwiki-latest-pages-articles-multistream.xml.bz2", "r")
    saxparser.parse(source)

def load_twarr_from_bz2(bz2_file):
    fp = bz2file.open(bz2_file, 'r')
    twarr = list()
    for line in fp.readlines():
        try:
            json_obj = json.loads(line.decode('utf8'))
            twarr.append(json_obj)
        except Exception:
            print('Error when parsing {}: {}'.format(bz2_file, line))
            continue
    fp.close()
    return twarr

def _extract_article_onebyone(self):
    wiki_pages = extract_pages(
        bz2file.open(self.download_wiki_articles_dump()))
    counter = 0
    w = tqdm(wiki_pages, desc=u'processed 0 articles')
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            yield d
            counter += 1
            if counter % 100 == 0:
                w.set_description(u'processed %s articles' % counter)

def universal_write_open(path, mode, buffering=-1, encoding=None, errors=None,
                         newline=None, compresslevel=9, format=None, check=-1,
                         preset=None, filters=None, compression=None):
    # pylint: disable=unexpected-keyword-arg
    if compression is None:
        return builtins.open(path, mode=mode, buffering=buffering,
                             encoding=encoding, errors=errors, newline=newline)
    elif compression == 'gz' or compression == 'gzip':
        if six.PY2:
            return gzip.open(path, mode=mode, compresslevel=compresslevel)
        else:
            return gzip.open(path, mode=mode, compresslevel=compresslevel,
                             errors=errors, newline=newline, encoding=encoding)
    elif compression == 'lzma' or compression == 'xz':
        return lzma.open(path, mode=mode, format=format, check=check,
                         preset=preset, filters=filters, encoding=encoding,
                         errors=errors, newline=newline)
    elif compression == 'bz2':
        return bz2.open(path, mode=mode, compresslevel=compresslevel,
                        encoding=encoding, errors=errors, newline=newline)
    else:
        raise ValueError(
            'compression must be None, gz, gzip, bz2, lzma, or xz '
            'and was {0}'.format(compression))

def test_to_file_compressed(self):
    tmp_path = 'functional/test/data/tmp/output.txt'
    sequence = self.seq(1, 2, 3, 4)
    sequence.to_file(tmp_path, compression='gzip')
    with gzip.open(tmp_path, 'rt') as output:
        self.assertEqual('[1, 2, 3, 4]', output.readlines()[0])
    sequence.to_file(tmp_path, compression='lzma')
    with lzma.open(tmp_path, 'rt') as output:
        self.assertEqual('[1, 2, 3, 4]', output.readlines()[0])
    sequence.to_file(tmp_path, compression='bz2')
    with bz2.open(tmp_path, 'rt') as output:
        self.assertEqual('[1, 2, 3, 4]', output.readlines()[0])

def iter_entities_from(filename, use_list=False):
    """
    bz2 Wikipedia n-triples import. For each entity, yields a set (or list)
    of values for each attribute defined in the file.
    """
    print('reading', filename)
    with bz2file.open(filename, 'rt') as dump:
        prev_resource = None
        values = None
        for l in dump:
            if l.startswith('#'):
                continue
            l = to_unicode_or_bust(l).strip()
            parts = get_parts(l)
            if not parts:
                continue
            resource = parts['resource']
            if prev_resource != resource:
                if prev_resource is not None:
                    yield values
                values = {'resource': resource}
                prev_resource = resource
            value = parts['object']
            if not value:
                continue
            key = parts['predicate']
            if key not in values:
                values[key] = [] if use_list else set()
            if use_list:
                values[key].append(value)
            else:
                values[key].add(value)
        if values is not None:
            yield values

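# A minimal usage sketch for iter_entities_from above. The dump path is
# hypothetical, and the call assumes the get_parts and to_unicode_or_bust
# helpers that this module is expected to provide.
for entity in iter_entities_from('labels_en.ttl.bz2'):
    # Each entity dict holds 'resource' plus one key per predicate.
    print(entity['resource'], len(entity) - 1)
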
def get_file_writer(file_handle, do_gzip, do_bzip):
    """Generate and return a file object with specified compression."""
    ofile = None
    if do_gzip and do_bzip:
        raise ValueError("Cannot specify both bzip and gzip compression!")
    if do_gzip:
        ofile = gzip.GzipFile(fileobj=file_handle, mode='w')
    elif do_bzip:
        ofile = bz2file.open(file_handle, mode='w')
    else:
        ofile = file_handle
    return ofile

def get_file_writer(file_handle, do_gzip, do_bzip):
    """Generate and return a file object with specified compression."""
    ofile = None
    if do_gzip and do_bzip:
        raise Exception("Cannot specify both bzip and gzip compression!")
    if do_gzip:
        ofile = gzip.GzipFile(fileobj=file_handle, mode='w')
    elif do_bzip:
        ofile = bz2file.open(file_handle, mode='w')
    else:
        ofile = file_handle
    return ofile

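# A minimal usage sketch for get_file_writer above, writing gzip-compressed
# bytes through an ordinary file handle (the output path is hypothetical):
with open('reads.fa.gz', 'wb') as raw:
    writer = get_file_writer(raw, do_gzip=True, do_bzip=False)
    writer.write(b'>seq1\nACGT\n')
    writer.close()  # flush the gzip trailer before raw closes
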
def wiki_bz_process(self, language):
    wiki = extract_pages(bz2file.open(self.corpus))
    f = codecs.open(os.path.join(os.path.dirname(self.corpus), 'wiki.txt'),
                    'w', encoding='utf-8')
    w = tqdm(wiki, desc="Currently got 0 files")
    if language == 'zh':
        for i, d in enumerate(w, 1):
            if not re.findall('^[a-zA-Z]+:', d[0]) and not re.findall(u'^#', d[1]):
                s = self.wiki_replace(d)
                f.write(s + '\n\n\n')
                if i % 100 == 0:
                    w.set_description('Currently got %s files' % i)
    elif language == 'en':
        pass
    f.close()

def bz2_vocab_reader(fpath):
    with bz2file.open(fpath, 'r') as f:
        info = f.readline()  # skip the header line
        while True:
            data = f.readline()
            data = data.decode('utf-8').strip()
            if data == "":
                break
            try:
                t_data = data.split()
                word = t_data[0].strip()
                embed = [float(v) for v in t_data[-300:]]
            except Exception:
                continue
            yield word, embed

def smart_open(filename):
    '''
    Returns an open file object if `filename` is plain text, else assumes
    it is a gzip- or bzip2-compressed file and returns a file-like object
    to handle it.
    '''
    if isplaintext(filename):
        f = open(filename, 'rt')
    else:
        file_type = mimetype(filename)
        if file_type.find('gzip') > -1:
            f = gzip.open(filename, 'rt')
        elif file_type.find('bzip2') > -1:
            f = bz2file.open(filename, 'rt')
        else:
            raise ValueError('Unsupported file format: {}'.format(file_type))
    return f

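# A minimal usage sketch for smart_open above, assuming the isplaintext and
# mimetype helpers that this module is expected to provide (the path is
# hypothetical):
with smart_open('corpus.txt.bz2') as f:
    for line in f:
        print(line.rstrip())
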
def wiki_process(input_file, save_path):
    # Parse the dump with gensim's wikicorpus helpers.
    wiki = extract_pages(bz2file.open(input_file))
    # Process and export.
    i = 0
    f = codecs.open(save_path, 'w', encoding='utf-8')
    w = tqdm(wiki, desc=u'fetched 0 articles')
    openCC = OpenCC('t2s')
    for d in w:
        if not re.findall('^[a-zA-Z]+:', d[0]) and d[0] and not re.findall(
                u'^#', d[1]):
            s = wiki_replace(d)
            f.write(s + '\n\n\n')
            i += 1
            if i % 100 == 0:
                w.set_description(u'fetched %s articles' % i)
    f.close()

def iter_entities_from(filename):
    """
    bz2 Wikipedia n-triples import. For each entity, yields a set of
    values for each attribute defined in the file.
    """
    print("reading", filename)
    with bz2file.open(filename, "rt") as dump:
        prev_resource = None
        values = None
        for l in dump:
            if l.startswith("#"):
                continue
            l = to_unicode_or_bust(l).strip()
            parts = get_parts(l)
            if not parts:
                continue
            resource = parts["resource"]
            if prev_resource != resource:
                if prev_resource is not None:
                    yield values
                values = {"resource": resource}
                prev_resource = resource
            value = parts["object"]
            if not value:
                continue
            key = parts["predicate"]
            if key not in values:
                values[key] = set()
            values[key].add(value)
        if values is not None:
            yield values

def load_single_file(file_str):
    textset = []
    retweet = 0
    favorite = 0
    infile = bz2file.open(file_str, 'r')
    lines = infile.readlines()
    infile.close()
    data_list = map(json.loads, lines)
    cleanr = re.compile(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z\t])|(\w+:\/\/\S+)')
    for data in data_list:
        if 'text' in data:
            try:
                if 'blockchain' in data['text'].lower():
                    cleantext = re.sub(cleanr, ' ', data['text']).split()
                    textset.append(cleantext)
                    retweet += data['retweet_count']
                    favorite += data['favorite_count']
            except Exception:
                pass
    return textset, retweet, favorite

def __init__(self, filePath, headerSymbols=['@', '+']):
    """
    Returns a read-by-read fastQ parser analogous to file.readline().
    Example: parser.next()
    -OR-
    It's an iterator, so you can do:
        for rec in parser:
            ... do something with rec ...
    rec is a tuple: (seqHeader, seqStr, qualHeader, qualStr)
    """
    if filePath.endswith('.gz'):
        self._file = gzip.open(filePath)
    elif filePath.endswith('.bz2'):
        self._file = bz2file.open(filePath, 'rt')
    else:
        self._file = open(filePath, 'r+')
    self._currentLineNumber = 0
    self._hdSyms = headerSymbols

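# A minimal usage sketch based on the docstring above, assuming this __init__
# belongs to a ParseFastQ-style class (the class name and file path are
# hypothetical):
parser = ParseFastQ('reads.fastq.gz')
for seqHeader, seqStr, qualHeader, qualStr in parser:
    print(seqHeader, len(seqStr))
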
def read_fq_list(filename):
    if filename.endswith(".gz"):
        h = gzip.open(filename)
    elif filename.endswith(".bz2"):
        h = bz2file.open(filename)
    else:
        h = open(filename)
    seq = []
    for n, line in enumerate(h):
        line = line.strip()
        if n % 4 == 0:
            if seq:
                yield seq
            seq = [line]
        else:
            seq.append(line)
    h.close()
    yield seq

def loadBz2fileDataSet(filePath, symbol, low=-1, high=1000):
    # When low >= 0, this slicing approach trades extra time for lower memory use.
    dataMat = []
    fr = bz2file.open(filePath, "r")
    if low < 0:
        for line in fr.readlines():
            lineArr = line.strip().split(symbol)
            dataMat.append(list(map(float, lineArr)))
    else:
        count = 1
        for line in fr.readlines():
            if count >= low:
                lineArr = line.strip().split(symbol)
                dataMat.append(list(map(float, lineArr)))
            if count == high:
                break
            count += 1
    return dataMat

def get_wiki2():
    from opencc import OpenCC
    # See the annotated walkthrough of this code in the blog post:
    # https://kexue.fm/archives/4176
    opencc1 = OpenCC("t2s")
    reobj1 = re.compile(
        r"[ `~!@#$%^&*\(\)-_=+\[\]\{\}\\\|;:\'\",<.>/?a-zA-Z\d]+")
    reobj2 = re.compile(r"\n+")
    reobj3 = re.compile("(())|(“”)|(「」)|(《》)|(“”)|(‘’)|(【】)|[,。?——!]{2,}")
    reuseful = re.compile('^[a-zA-Z]+:')
    redirect = re.compile(r"^#")

    def wiki_replace(s):
        s = filter_wiki(s)
        s = reobj1.sub("", s)  # strip vertical bars (|) before uploading to Aliyun
        s = reobj2.sub("#", s)
        s = reobj3.sub("", s)
        return opencc1.convert(s).strip()

    wiki = extract_pages(
        bz2file.open(
            r'E:\02program\python\nlp\data\corpus\zhwiki-latest-pages-articles.xml.bz2'
        ))
    with codecs.open('wiki-tw.csv', 'w', encoding='utf-8') as f:
        i = 0
        filelist = []
        for d in tqdm(wiki):
            if not reuseful.findall(d[0]) and not redirect.findall(d[1]):
                i += 1
                try:
                    filelist.append(
                        reobj1.sub("", d[0]) + "|" + wiki_replace(d[1]) + "\n")
                except Exception as e:
                    print(d[0], '=' * 10, d[1])
                if i % 1000 == 0:
                    f.write("".join(filelist))
                    filelist = []
        if filelist:
            f.write("".join(filelist))

def documents(self):
    """Iterates over pages in the wikinews archive.

    Yields:
      docid, title, url, curid, revid, wiki_doc
    """
    logging.info("Extracting docs from [%s]", self._wikinews_archive)
    file_list = glob.glob(self._wikinews_archive)
    assert file_list, self._wikinews_archive
    for archive in file_list:
        with bz2file.open(archive, "rt", encoding="utf-8", errors="strict") as xf:
            # One line of json as produced by wikiextractor.
            for line in xf:
                record = json.loads(line)

                # Extract page title.
                title = html.unescape(record["title"])

                curid = record["id"]     # pageid, e.g. 73052
                revid = record["revid"]  # revision, e.g. 730271
                url = record["url"]      # e.g. https://de.wikinews.org/wiki?curid=73052
                logging.debug("Got title: %s", title)

                # Skip pages that don't have text.
                wiki_doc = record.get("text")
                if not wiki_doc:
                    self._counter["no text"] += 1
                    self._filtered_no_text.append(url)
                    logging.debug("Skip: no text element")
                    continue

                # Apply manual fixes.
                wiki_doc = _apply_wiki_overrides(wiki_doc, url)

                # Create internal document identifier.
                docid = f"{self._language}-{curid}"

                logging.debug("Found (%s): %s", docid, title)
                self._counter["found"] += 1
                yield docid, title, url, curid, revid, wiki_doc

STOPWORDS = set()
URL_STOPWORDS = set()

with open('labels.json', 'r') as f:
    labels = json.load(f)

with open('stopwords.txt', 'r') as f:
    for line in f:
        STOPWORDS.add(line.rstrip())

with open('url_stopwords.txt', 'r') as f:
    for line in f:
        URL_STOPWORDS.add(line.rstrip())

line_no = 0
with bz2file.open(sys.argv[1], 'rb') as tsvfile:
    with open(sys.argv[2], 'w') as arfffile:
        csvwriter = csv.writer(arfffile, quoting=csv.QUOTE_ALL)
        arfffile.write("@relation 40-cat-training\n")
        """
        arfffile.write("@attribute url string\n")
        arfffile.write("@attribute title string\n")
        """
        arfffile.write("@attribute tokens string\n")
        arfffile.write("@attribute klass {{{0}}}\n".format(",".join(labels)))
        arfffile.write("@data\n")
        for line in tsvfile:
            line_no += 1
            line = line.strip()

import sys

sys.path.remove(
    '/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python'
)
# sys.path.remove('/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/PyObjC')
sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages')
# sys.path.append('/Users/xujiaxing/anaconda/lib/python3.6/site-packages/opencc.py')

from gensim.corpora.wikicorpus import extract_pages, filter_wiki
import bz2file
import re
import opencc
from tqdm import tqdm
import codecs

wiki = extract_pages(bz2file.open('zhwiki-latest-pages-articles.xml.bz2'))


def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()

if __name__ == "__main__":
    # mongo init
    client = MongoClient("mongodb://*****:*****@ds039860.mongolab.com:39860/changesets")
    db = client.changesets
    changesetcollection = db.changesets
    metacollection = db.meta

    # counters
    i = 0
    j = 0
    k = 0

    for root, dirs, files in os.walk(BASEDIR):
        dirs.sort(reverse=True)
        files.sort(reverse=True)
        for name in files:
            if name.endswith('.bz2'):
                f = bz2file.open(os.path.join(root, name), 'rb')
                i += 1
                docs = []
                try:
                    for event, elem in ET.iterparse(f):
                        if elem.tag == "changeset":
                            # check if changeset has bbox metadata (is not empty)
                            if all(k in elem.attrib for k in
                                   ('min_lat', 'max_lat', 'min_lon', 'max_lon')):
                                min_lat = float(elem.attrib['min_lat'])
                                min_lon = float(elem.attrib['min_lon'])
                                max_lat = float(elem.attrib['max_lat'])
                                max_lon = float(elem.attrib['max_lon'])
                                bbox = box(min_lon, min_lat, max_lon, max_lat)
                                # send output to stdout
                                if not i % BULK_SIZE:
                                    sys.stdout.write('.')

def open(self, *args, **kwargs):
    return bz2file.open(*args, **kwargs)

mlp.Layer("Rectifier", units=64, dropout=0.3), mlp.Layer("Rectifier", units=48, dropout=0.1), mlp.Layer("Rectifier", units=32), mlp.Layer("Softmax")], learning_rate=0.01, learning_rule='rmsprop', n_iter=10, n_stable=10, batch_size=50, valid_set=(X,y), verbose=1) try: nn.fit(X, y) except KeyboardInterrupt: pass score = nn.score(X, y) print('SCORE:', score * 100.0) print('MISMATCHES:') yp = nn.predict(X) y = y.reshape(yp.shape) for a in numpy.where(y != yp)[0]: print(ds.filenames[a], 'was', int(yp[a]), 'not', int(y[a])) with bz2.open('detector_train.pkl.bz2', 'wb') as f: pickle.dump(nn, f)