def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
  global b_dirs
  num_instances = len(paths)
  num_features = max(i for v in tk_output.values() for i in v) + 1

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    chunksize = min(len(paths) / (args.jobs * 2), args.chunksize)
  else:
    chunksize = min(len(paths) / (mp.cpu_count() * 2), args.chunksize)

  # TODO: Set the output dir
  b_dirs = [ tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path) for i in range(args.buckets) ]

  output_states = set(tk_output)

  path_chunks = list(chunk(paths, chunksize))
  pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)

  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs)
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)
    write_count = sum(pass_tokenize_out)

  if not SILENT:
    print "wrote a total of %d keys" % write_count

  pass_ptc_params = (cm, num_instances)
  with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
    pass_ptc_out = f(pass_ptc, b_dirs)
    reads, ids, prods = zip(*pass_ptc_out)
    read_count = sum(reads)

  if not SILENT:
    print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  # Re-order the per-bucket products into a single term-by-class count matrix
  prod = np.zeros((num_features, cm.shape[1]), dtype=int)
  prod[np.concatenate(ids)] = np.vstack(prods)
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)

  return nb_ptc
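# A minimal sketch (not part of the training code) of the additive smoothing
# used in learn_ptc: ptc = log(1 + prod) - log(num_features + prod.sum(0)) is
# a Laplace-smoothed, log-space estimate of P(term|class), computed column-wise
# over the term-by-class count matrix `prod`. The toy counts below are made up
# purely for illustration.
def _demo_smoothed_ptc():
  import numpy as np

  # 4 features x 2 classes of raw term counts
  prod = np.array([[3, 0],
                   [1, 2],
                   [0, 0],
                   [5, 1]])
  num_features = prod.shape[0]

  # Add-one smoothing in log space; each column sums to 1 in probability space
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))
  assert np.allclose(np.exp(ptc).sum(0), 1.0)
  return ptc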
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None,
                chunksize=CHUNKSIZE, sample_count=None, sample_size=None,
                term_freq=False, line_level=False):
  """
  @param items a list of (domain, language, path) tuples
  """
  global b_dirs, complete

  # Our exitfunc uses this to know whether to delete the tokenized files
  complete = False

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ os.path.join(outdir, "bucket{0}".format(i)) for i in range(buckets) ]
  for d in b_dirs:
    os.mkdir(d)

  # PASS 1: Tokenize documents into sets of terms

  # If there are few items, make the chunk size such that each job
  # will have 2 chunks
  chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))

  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size, term_freq, line_level)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)

    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
    print "job count: {0}".format(jobs)

    if sample_count:
      print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count)
    else:
      print "whole-document tokenization"

    for i, keycount in enumerate(pass_tokenize_out):
      print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count, keycount)

  complete = True

  return b_dirs
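# The functions in this listing rely on `chunk` (and, in learn_ptc/learn_ftc,
# `offsets`) helpers that are not shown here. The sketch below is an assumption
# about their behaviour, not the project's actual implementation: `chunk`
# splits a sequence into pieces of at most `chunksize` items, and `offsets`
# returns the starting document offset of each chunk.
from itertools import islice

def _sketch_chunk(seq, chunksize):
  """Yield successive chunks of up to `chunksize` items from `seq` (assumed behaviour of `chunk`)."""
  seq_iter = iter(seq)
  while True:
    piece = tuple(islice(seq_iter, chunksize))
    if not piece:
      break
    yield piece

def _sketch_offsets(chunks):
  """Return the starting offset of each chunk (assumed behaviour of `offsets`)."""
  starts = []
  total = 0
  for c in chunks:
    starts.append(total)
    total += len(c)
  return starts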
def learn_ftc(paths, tk_nextmove, tk_output, cm, temp_path, args):
  global b_dirs
  num_instances = len(paths)
  num_features = max(i for v in tk_output.values() for i in v) + 1

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    chunksize = min(len(paths) / (args.jobs * 2), args.chunksize)
  else:
    chunksize = min(len(paths) / (mp.cpu_count() * 2), args.chunksize)

  # TODO: Set the output dir
  b_dirs = [ tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path) for i in range(args.buckets) ]

  output_states = set(tk_output)

  path_chunks = list(chunk(paths, chunksize))
  pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)

  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs)
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)
    write_count = sum(pass_tokenize_out)

  logger.info("wrote a total of %d keys", write_count)

  # TODO: Report on the progress of this pass
  pass_ftc_params = (cm, num_instances)
  with MapPool(args.jobs, setup_pass_ftc, pass_ftc_params) as f:
    pass_ftc_out = f(pass_ftc, b_dirs)
    reads, ids, prods = zip(*pass_ftc_out)
    read_count = sum(reads)

  logger.info("read a total of %d keys (%d short)", read_count, write_count - read_count)

  # Re-order the weights into a single ndarray
  term_lang_counts = np.zeros((num_features, cm.shape[1]), dtype=int)
  term_lang_counts[np.concatenate(ids)] = np.vstack(prods)

  return term_lang_counts
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None,
                chunksize=CHUNKSIZE, sample_count=None, sample_size=None,
                term_freq=False, line_level=False):
  global b_dirs, complete

  # Used to decide whether the tokenized files should be deleted
  complete = False

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ os.path.join(outdir, "bucket{0}".format(i)) for i in range(buckets) ]
  for d in b_dirs:
    os.mkdir(d)

  # PASS 1: Split the documents into chunks
  chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))

  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size, term_freq, line_level)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)

    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
    print "job count: {0}".format(jobs)

    if sample_count:
      print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count)
    else:
      print "whole-document tokenization"

    for i, keycount in enumerate(pass_tokenize_out):
      print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count, keycount)

  complete = True

  return b_dirs
def tally_lf(bucketlist, jobs=None):
  """
  Sum up k,v pairs across all buckets. This builds a global mapping of
  terms to the number of languages the terms occur in.
  """
  lang_count = {}
  with MapPool(jobs) as f:
    pass_sum_lf_out = f(pass_sum_lf, bucketlist)

    for i, v in enumerate(pass_sum_lf_out):
      lang_count.update(v)
      logger.debug("processed bucket ({0}/{1}) [{2} terms]".format(i + 1, len(bucketlist), len(v)))

  return lang_count
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunksize=CHUNKSIZE):
  """
  @param items a list of (domain, language, path) tuples
  """
  global b_dirs, complete

  # Our exitfunc uses this to know whether to delete the tokenized files
  complete = False

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ tempfile.mkdtemp(prefix="tokenize-", suffix='-{0}'.format(tokenizer.__class__.__name__), dir=outdir) for i in range(buckets) ]

  # PASS 1: Tokenize documents into sets of terms

  # If there are few items, make the chunk size such that each job
  # will have 2 chunks
  chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))

  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)

    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
    print "job count: {0}".format(jobs)

    for i, keycount in enumerate(pass_tokenize_out):
      print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count, keycount)

  complete = True

  return b_dirs
def prager_select(bucketlist, lang_count, k, jobs=None):
  """
  Compute the feature selection score according to Prager (1999). This is
  basically a tf-idf computation (where the 'df' used is the number of
  languages a term occurs in rather than the number of training documents).

  @param k threshold value for selection. We select when score > k
  """
  features = set()
  with MapPool(jobs, setup_pass_select, (lang_count, k)) as f:
    pass_select_out = f(pass_select, bucketlist)

    for i, feats in enumerate(pass_select_out):
      features |= feats
      logger.debug("processed bucket ({0}/{1}) [selected {2}]".format(i + 1, len(bucketlist), len(feats)))

  return features
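# The actual per-term scoring lives in pass_select, which is not shown in this
# listing. As an illustration only, a tf-idf style score with "inverse language
# frequency" in place of inverse document frequency could look like the sketch
# below; the exact formula used by pass_select may differ.
import math

def _sketch_tfilf_score(term_freq, langs_with_term, num_langs):
  """Hypothetical tf-ilf score: term frequency weighted by how few languages use the term."""
  return term_freq * math.log(float(num_langs) / langs_with_term)

# Example intuition: a term seen 120 times but present in only 2 of 97 languages
# scores far higher than a term seen 500 times across 90 languages.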
def tally(bucketlist, jobs=None):
  """
  Sum up the counts for each feature across all buckets. This builds a
  full mapping of feature->count. It is stored in-memory and thus could
  be an issue for large feature sets.
  """
  with MapPool(jobs) as f:
    pass_sum_df_out = f(pass_sum_df, bucketlist)

    for i, keycount in enumerate(pass_sum_df_out):
      print "processed bucket (%d/%d) [%d keys]" % (i + 1, len(bucketlist), keycount)

  # build the global term->df mapping
  doc_count = {}
  for bucket in bucketlist:
    for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')):
      doc_count[key] = value

  return doc_count
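# The term -> document-frequency mapping returned by tally is a natural input
# to a top-N candidate selection. The snippet below is only a usage sketch;
# the selection step actually applied to this mapping is not shown in this
# listing.
from operator import itemgetter

def _sketch_top_df_terms(doc_count, n):
  """Return the n terms with the highest document frequency."""
  return [term for term, df in sorted(doc_count.items(), key=itemgetter(1), reverse=True)[:n]]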
def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None):
  pass_IG_args = (features, dist, binarize, suffix)

  num_chunk = len(bucketlist)
  weights = []
  terms = []

  with MapPool(job_count, setup_pass_IG, pass_IG_args) as f:
    pass_IG_out = f(pass_IG, bucketlist)

    for i, (t, w) in enumerate(pass_IG_out):
      weights.append(w)
      terms.extend(t)
      print "processed chunk (%d/%d) [%d terms]" % (i + 1, num_chunk, len(t))

  if binarize:
    weights = numpy.hstack(weights).transpose()
  else:
    weights = numpy.concatenate(weights)
  terms = ["".join(t) for t in terms]

  return zip(terms, weights)
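# pass_IG (not shown) computes the per-term weights aggregated in compute_IG.
# For reference, a generic information-gain computation for a single term,
# given per-class document counts with and without the term, might look like
# this sketch. It is a standard IG formulation, not necessarily the exact one
# used by pass_IG.
import numpy

def _sketch_entropy(counts):
  if counts.sum() == 0:
    return 0.0
  p = counts / float(counts.sum())
  p = p[p > 0]
  return -(p * numpy.log2(p)).sum()

def _sketch_info_gain(with_term, without_term):
  """IG(t) = H(C) - [P(t) H(C|t) + P(~t) H(C|~t)] over per-class doc counts."""
  with_term = numpy.asarray(with_term, dtype=float)
  without_term = numpy.asarray(without_term, dtype=float)
  total = with_term + without_term
  p_t = with_term.sum() / total.sum()
  return _sketch_entropy(total) - (p_t * _sketch_entropy(with_term)
                                   + (1 - p_t) * _sketch_entropy(without_term))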
def tfilf_select(bucketlist, lang_count, count, jobs=None):
  """
  Do a feature selection of the top-N features, using the same scoring as
  Prager but selecting a fixed number of features rather than all features
  whose score exceeds a threshold K.

  We optimize slightly by observing that in the extreme case all `count`
  selected features could come from a single bucket; hence we select `count`
  from each bucket, then keep the top `count` overall.
  """
  features = []
  with MapPool(jobs, setup_pass_tfilf, (lang_count, count)) as f:
    pass_tfilf_out = f(pass_tfilf, bucketlist)

    for i, feats in enumerate(pass_tfilf_out):
      # Keep selecting the n-largest from the previous output and the new candidates
      features = heapq.nlargest(count, itertools.chain(features, feats))
      logger.debug("processed bucket ({0}/{1})".format(i + 1, len(bucketlist)))

  return [f for c, r, f in sorted(features, reverse=True)]
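# A small standalone illustration (toy data) of the incremental top-N merge
# used in tfilf_select: repeatedly taking heapq.nlargest over the running
# selection plus each new bucket's candidates keeps memory bounded at `count`
# items while yielding the same result as sorting everything at once.
import heapq
import itertools

def _demo_incremental_nlargest():
  count = 3
  buckets = [[(0.9, 0, 'a'), (0.1, 0, 'b')],
             [(0.8, 1, 'c'), (0.95, 1, 'd')],
             [(0.5, 2, 'e')]]
  features = []
  for feats in buckets:
    features = heapq.nlargest(count, itertools.chain(features, feats))
  assert features == heapq.nlargest(count, itertools.chain(*buckets))
  return [f for c, r, f in sorted(features, reverse=True)]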
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
  """
  @param items label, path pairs
  """
  global outdir

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    tasks = args.jobs * 2
  else:
    tasks = mp.cpu_count() * 2

  # Ensure chunksize of at least 1, but not exceeding specified chunksize
  chunksize = max(1, min(len(items) / tasks, args.chunksize))

  outdir = tempfile.mkdtemp(prefix="NBtrain-", suffix='-buckets', dir=temp_path)
  b_dirs = [ os.path.join(outdir, "bucket{0}".format(i)) for i in range(args.buckets) ]

  for d in b_dirs:
    os.mkdir(d)

  output_states = set(tk_output)

  # Divide all the items to be processed into chunks, and enumerate each chunk.
  item_chunks = list(chunk(items, chunksize))
  pass_tokenize_arg = enumerate(item_chunks)

  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs, args.line)
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

    write_count = 0
    chunk_sizes = {}
    labels = []
    for chunk_id, doc_count, writes, _labels in pass_tokenize_out:
      write_count += writes
      chunk_sizes[chunk_id] = doc_count
      labels.extend(_labels)

  print "wrote a total of %d keys" % write_count

  num_instances = sum(chunk_sizes.values())
  print "processed a total of %d instances" % num_instances

  chunk_offsets = {}
  for i in range(len(chunk_sizes)):
    chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))
    print " offset for chunk {0} is {1}".format(i, chunk_offsets[i])

  pass_fm_params = (num_instances, chunk_offsets)
  with MapPool(args.jobs, setup_pass_fm, pass_fm_params) as f:
    pass_fm_out = f(pass_fm, b_dirs)

    reads, ids, fms = zip(*pass_fm_out)
    read_count = sum(reads)

  print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  num_features = max(i for v in tk_output.values() for i in v) + 1
  fm = np.zeros((num_features, num_instances), dtype=int)
  fm[np.concatenate(ids)] = np.vstack(fms)

  print "have {} labels".format(len(labels))
  cm = np.zeros((num_instances, num_langs), dtype='bool')
  for doc_id, lang_id in enumerate(labels):
    cm[doc_id, lang_id] = True

  # This is where the smoothing occurs
  prod = np.dot(fm, cm)
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)

  pc = np.log(cm.sum(0))
  nb_pc = array.array('d', pc)

  return nb_pc, nb_ptc
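# A toy illustration (made-up counts) of the np.dot(fm, cm) step above: with
# `fm` holding term counts per document (num_features x num_instances) and
# `cm` one-hot encoding each document's language (num_instances x num_langs),
# the product accumulates term counts per language.
def _demo_term_class_counts():
  import numpy as np

  fm = np.array([[2, 0, 1],    # term 0 counts in docs 0..2
                 [0, 3, 1]])   # term 1 counts in docs 0..2
  cm = np.array([[1, 0],       # doc 0 -> lang 0
                 [0, 1],       # doc 1 -> lang 1
                 [0, 1]])      # doc 2 -> lang 1
  prod = np.dot(fm, cm)        # array([[2, 1], [0, 4]])
  return prod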
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
  """
  @param items label, path pairs
  """
  global outdir

  print "learning NB parameters on {} items".format(len(items))

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    tasks = args.jobs * 2
  else:
    tasks = mp.cpu_count() * 2

  # Ensure chunksize of at least 1, but not exceeding specified chunksize
  chunksize = max(1, min(len(items) / tasks, args.chunksize))

  outdir = tempfile.mkdtemp(prefix="NBtrain-", suffix='-buckets', dir=temp_path)
  b_dirs = [ os.path.join(outdir, "bucket{0}".format(i)) for i in range(args.buckets) ]

  for d in b_dirs:
    os.mkdir(d)

  output_states = set(tk_output)

  # Divide all the items to be processed into chunks, and enumerate each chunk.
  item_chunks = list(chunk(items, chunksize))
  num_chunks = len(item_chunks)
  print "about to tokenize {} chunks".format(num_chunks)

  pass_tokenize_arg = enumerate(item_chunks)
  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs, args.line)
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

    write_count = 0
    chunk_sizes = {}
    chunk_labels = []
    for i, (chunk_id, doc_count, writes, labels) in enumerate(pass_tokenize_out):
      write_count += writes
      chunk_sizes[chunk_id] = doc_count
      chunk_labels.append((chunk_id, labels))
      print "processed chunk ID:{0} ({1}/{2}) [{3} keys]".format(chunk_id, i + 1, num_chunks, writes)

  print "wrote a total of %d keys" % write_count

  num_instances = sum(chunk_sizes.values())
  print "processed a total of %d instances" % num_instances

  chunk_offsets = {}
  for i in range(len(chunk_sizes)):
    chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))

  # Build the class map (CM) by re-ordering chunks back into document order
  cm = np.zeros((num_instances, num_langs), dtype='bool')
  for chunk_id, chunk_label in chunk_labels:
    for doc_id, lang_id in enumerate(chunk_label):
      index = doc_id + chunk_offsets[chunk_id]
      cm[index, lang_id] = True

  pass_ptc_params = (cm, num_instances, chunk_offsets)
  with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
    pass_ptc_out = f(pass_ptc, b_dirs)

    def pass_ptc_progress():
      for i, v in enumerate(pass_ptc_out):
        yield v
        print "processed chunk ({0}/{1})".format(i + 1, len(b_dirs))

    reads, ids, prods = zip(*pass_ptc_progress())
    read_count = sum(reads)

  print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  num_features = max(i for v in tk_output.values() for i in v) + 1
  prod = np.zeros((num_features, cm.shape[1]), dtype=int)
  prod[np.concatenate(ids)] = np.vstack(prods)

  # This is where the smoothing occurs
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)

  pc = np.log(cm.sum(0))
  nb_pc = array.array('d', pc)

  return nb_pc, nb_ptc
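# A toy illustration (made-up labels) of how the boolean class map `cm` and the
# prior vector `pc` above relate: each row of `cm` one-hot encodes a document's
# language, so cm.sum(0) gives per-language document counts and pc is their
# log. The prior is left unnormalised, which does not change the argmax of the
# Naive Bayes decision rule.
def _demo_class_prior():
  import numpy as np

  labels = [0, 2, 2, 1, 2]   # language index per document
  num_langs = 3
  cm = np.zeros((len(labels), num_langs), dtype='bool')
  for doc_id, lang_id in enumerate(labels):
    cm[doc_id, lang_id] = True

  counts = cm.sum(0)         # array([1, 1, 3])
  pc = np.log(counts)
  return pc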
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None,
                chunksize=CHUNKSIZE, sample_count=None, sample_size=None):
  """
  @param items a list of (domain, language, path) tuples
  """
  global b_dirs, complete

  # Our exitfunc uses this to know whether to delete the tokenized files
  complete = False

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ tempfile.mkdtemp(prefix="tokenize-", suffix='-{0}'.format(tokenizer.__class__.__name__), dir=outdir) for i in range(buckets) ]

  # PASS 1: Tokenize documents into sets of terms

  # If there are few items, make the chunk size such that each job
  # will have 2 chunks
  chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))

  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)

    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    logger.info("chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count))
    logger.info("job count: {0}".format(jobs))

    if sample_count:
      logger.info("sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count))
    else:
      logger.info("whole-document tokenization")

    total_bytes = 0
    for i, chunk_bytes in enumerate(pass_tokenize_out):
      logger.debug("tokenized chunk (%d/%d) [%d bytes]" % (i + 1, chunk_count, chunk_bytes))
      total_bytes += chunk_bytes

  logger.info("tokenized a total of {0} MB".format(total_bytes / 1024 / 1024))

  complete = True

  return b_dirs