# Feature-selection and counting passes over on-disk "buckets" of marshalled
# records. These functions assume helpers from the surrounding training
# package: unmarshal_iter, MapPool and Enumerator (the import path below
# follows langid.py's train tools and is an assumption), plus an entropy()
# function over count vectors (see the sketch after pass_IG).
import os
import gzip
import heapq
import random
import marshal

from collections import defaultdict

import numpy
import numpy as np

from common import unmarshal_iter, MapPool, Enumerator


def pass_ptc(b_dir):
  """
  Take a bucket, form a feature map, compute the count of
  each feature in each class.

  @param b_dir path to the bucket directory
  @returns (read_count, f_ids, prod)
  """
  global __cm, __num_instances, __chunk_offsets

  terms = defaultdict(lambda: np.zeros((__num_instances,), dtype='int'))

  read_count = 0
  for path in os.listdir(b_dir):
    if path.endswith('.index'):
      for f_id, chunk_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)):
        index = doc_id + __chunk_offsets[chunk_id]
        terms[f_id][index] = count
        read_count += 1

  f_ids, f_vs = zip(*terms.items())
  fm = np.vstack(f_vs)
  # The calculation of the term-class distribution is done per-chunk rather
  # than globally for memory efficiency reasons.
  prod = np.dot(fm, __cm)

  return read_count, f_ids, prod

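# Self-contained sketch of the core computation in pass_ptc: __cm is assumed
# to be a 0/1 document-by-class membership matrix, so np.dot(fm, __cm) sums
# each feature's per-document counts into per-class counts. Toy values only.
def _demo_term_class_counts():
  fm = np.array([[2, 0, 1, 3],   # counts of feature A in docs 0..3
                 [0, 4, 0, 1]])  # counts of feature B in docs 0..3
  cm = np.array([[1, 0],         # docs 0 and 1 belong to class 0
                 [1, 0],
                 [0, 1],         # docs 2 and 3 belong to class 1
                 [0, 1]])
  prod = np.dot(fm, cm)
  assert (prod == np.array([[2, 4], [4, 1]])).all()
  return prod
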
def pass_sum_lf(bucket):
  """
  Compute 'document frequency' as defined by Prager (1999) -
  the number of languages a term appears in.
  """
  term_langs = defaultdict(set)
  for path in os.listdir(bucket):
    if path.endswith('.lang'):
      for key, lang, _ in unmarshal_iter(os.path.join(bucket, path)):
        term_langs[key].add(lang)

  retval = dict((k, len(v)) for k, v in term_langs.iteritems())
  return retval

def pass_sum_df(bucket):
  """
  Compute document frequency (df) by summing up (key, domain, count)
  triplets over all domains.
  """
  doc_count = defaultdict(int)
  count = 0
  with gzip.open(os.path.join(bucket, "docfreq"), 'wb') as docfreq:
    for path in os.listdir(bucket):
      # We use the domain buckets as there are usually fewer domains
      if path.endswith('.domain'):
        for key, _, value in unmarshal_iter(os.path.join(bucket, path)):
          doc_count[key] += value
          count += 1

    for item in doc_count.iteritems():
      docfreq.write(marshal.dumps(item))
  return count

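# unmarshal_iter is provided by the surrounding package; this stand-in states
# its assumed behaviour: iterate over marshal records concatenated into a
# gzip-compressed file, matching the "docfreq" format written above. The
# temp-file copy is an assumption driven by Python 2's marshal.load, which
# accepts only real file objects, not GzipFile.
import tempfile

def _unmarshal_iter_sketch(path):
  with gzip.open(path, 'rb') as f:
    data = f.read()
  with tempfile.TemporaryFile() as t:
    t.write(data)
    t.seek(0)
    while True:
      try:
        yield marshal.load(t)
      except EOFError:
        break
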
def pass_select(bucket):
  """
  Select features from a chunk that meet our selection criteria.
  """
  global __lang_count, __k

  # Compute the term-language frequency first
  term_lang_count = defaultdict(int)
  for path in os.listdir(bucket):
    if path.endswith('.lang'):
      for key, lang, value in unmarshal_iter(os.path.join(bucket, path)):
        term_lang_count[key, lang] += value

  features = set()
  for (term, lang), count in term_lang_count.iteritems():
    if count >= __k * __lang_count[term]:
      features.add(term)

  return features

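# Worked example of the selection rule above, with toy counts. lang_count
# plays the role of __lang_count (the per-term language frequency computed by
# pass_sum_lf): with k = 5, a term seen in 2 languages is kept only if some
# single language accounts for at least 5 * 2 = 10 of its occurrences.
def _demo_select(k=5):
  lang_count = {'the': 2}                 # 'the' occurs in 2 languages
  term_lang_count = {('the', 'en'): 12,   # 12 >= 10 -> selected
                     ('the', 'fr'): 3}    #  3 <  10 -> no effect
  features = set()
  for (term, lang), count in term_lang_count.iteritems():
    if count >= k * lang_count[term]:
      features.add(term)
  assert features == set(['the'])
  return features
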
def tally(bucketlist, jobs=None):
  """
  Sum up the counts for each feature across all buckets. This
  builds a full mapping of feature->count. This is stored in-memory
  and thus could be an issue for large feature sets.
  """
  with MapPool(jobs) as f:
    pass_sum_df_out = f(pass_sum_df, bucketlist)

    for i, keycount in enumerate(pass_sum_df_out):
      print "processed bucket (%d/%d) [%d keys]" % (i + 1, len(bucketlist), keycount)

  # build the global term->df mapping
  doc_count = {}
  for bucket in bucketlist:
    for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')):
      doc_count[key] = value

  return doc_count

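# Typical use of tally (a hedged sketch; the bucket paths and counts are
# placeholders): the returned term->df mapping fits in memory, so candidate
# features can be taken directly with heapq, e.g.:
#
#   doc_count = tally(['bucket0', 'bucket1'], jobs=4)
#   candidates = heapq.nlargest(5000, doc_count, key=doc_count.get)
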
def pass_fm(b_dir):
  """
  Take a bucket and form a feature map: a matrix of per-document
  counts for each feature in the bucket.

  @param b_dir path to the bucket directory
  @returns (read_count, f_ids, fm)
  """
  global __num_instances, __chunk_offsets

  terms = defaultdict(lambda: np.zeros((__num_instances,), dtype='int'))

  read_count = 0
  for path in os.listdir(b_dir):
    if path.endswith('.index'):
      for f_id, chunk_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)):
        index = doc_id + __chunk_offsets[chunk_id]
        terms[f_id][index] = count
        read_count += 1

  f_ids, f_vs = zip(*terms.items())
  fm = np.vstack(f_vs)
  return read_count, f_ids, fm

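# How __chunk_offsets flattens a (chunk_id, doc_id) pair into one global row
# index (an assumption consistent with its use in pass_fm and pass_ptc):
# offsets are the cumulative chunk sizes. Toy chunk sizes for illustration.
def _demo_chunk_offsets():
  chunk_sizes = [3, 2]
  chunk_offsets = {}
  offset = 0
  for chunk_id, size in enumerate(chunk_sizes):
    chunk_offsets[chunk_id] = offset
    offset += size
  assert chunk_offsets == {0: 0, 1: 3}
  assert 0 + chunk_offsets[1] == 3  # (chunk_id=1, doc_id=0) -> row 3
  return chunk_offsets
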
def pass_tfilf(bucket):
  """
  Select top-n features from a chunk by Prager's TF-ILF criteria.
  """
  global __lang_count, __count

  # Compute the term-language frequency first
  term_lang_count = defaultdict(lambda: defaultdict(int))
  for path in os.listdir(bucket):
    if path.endswith('.lang'):
      for key, lang, value in unmarshal_iter(os.path.join(bucket, path)):
        term_lang_count[key][lang] += value

  def f_iter():
    for term in term_lang_count:
      lf = float(__lang_count[term])
      tf = max(term_lang_count[term].values())  # most frequent language
      rval = random.random()  # randomizes the order of same-score items
      yield (tf / lf, rval, term)  # TF-ILF score as described by Prager

  retval = heapq.nlargest(__count, f_iter())
  return retval

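# Why pass_tfilf includes a random tie-breaker: heapq.nlargest compares the
# (score, rval, term) tuples element-wise, so equal scores fall through to
# rval, giving a random rather than lexicographic order among ties. Toy demo.
def _demo_tfilf_ties():
  scored = [(2.0, random.random(), t) for t in ('aaa', 'bbb', 'ccc')]
  scored.append((5.0, random.random(), 'zzz'))
  top = heapq.nlargest(2, scored)
  assert top[0][2] == 'zzz'      # the highest score always wins
  return [t for _, _, t in top]  # the runner-up varies run to run
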
def pass_IG(buckets):
  """
  In this pass we compute the information gain for each feature, binarized
  with respect to each language as well as unified over the set of all
  classes.

  @global __features the list of features to compute IG for
  @global __dist the background distribution
  @global __binarize (boolean) compute IG binarized per-class if True
  @global __suffix of files in bucketdir to process

  @param buckets a list of buckets. Each bucket must be a directory that
                 contains files with the appropriate suffix. Each file must
                 contain marshalled (term, event_id, count) triplets.
  """
  global __features, __dist, __binarize, __suffix

  # We first tally the per-event frequency of each
  # term in our selected feature set.
  term_freq = defaultdict(lambda: defaultdict(int))
  term_index = defaultdict(Enumerator())

  for bucket in buckets:
    for path in os.listdir(bucket):
      if path.endswith(__suffix):
        for key, event_id, count in unmarshal_iter(os.path.join(bucket, path)):
          # Select only our listed features
          if key in __features:
            term_index[key]  # touching the defaultdict assigns the term an id
            term_freq[key][event_id] += count

  num_term = len(term_index)
  num_event = len(__dist)

  cm_pos = numpy.zeros((num_term, num_event), dtype='int')

  for term, term_id in term_index.iteritems():
    # update event matrix
    freq = term_freq[term]
    for event_id, count in freq.iteritems():
      cm_pos[term_id, int(event_id)] = count
  cm_neg = __dist - cm_pos
  cm = numpy.dstack((cm_neg, cm_pos))

  if not __binarize:
    # non-binarized event space
    x = cm.sum(axis=1)
    term_w = x / x.sum(axis=1)[:, None].astype(float)

    # Entropy of the term-present/term-absent events
    e = entropy(cm, axis=1)

    # Information Gain with respect to the set of events
    ig = entropy(__dist) - (term_w * e).sum(axis=1)
  else:
    # binarized event space
    # Compute IG binarized with respect to each event
    ig = list()
    for event_id in xrange(num_event):
      num_doc = __dist.sum()
      prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc

      cm_bin = numpy.zeros((num_term, 2, 2), dtype=int)  # (term, p(term), p(lang|term))
      cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
      cm_bin[:, 1, :] = cm[:, event_id, :]

      e = entropy(cm_bin, axis=1)
      x = cm_bin.sum(axis=1)
      term_w = x / x.sum(axis=1)[:, None].astype(float)

      ig.append(entropy(prior) - (term_w * e).sum(axis=1))
    ig = numpy.vstack(ig)

  terms = sorted(term_index, key=term_index.get)
  return terms, ig

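# entropy() is supplied elsewhere in the package; the stand-in below states
# its assumed semantics (Shannon entropy, in nats, of counts summed along
# `axis`) and checks the non-binarized IG computation on a toy example: a
# term present in every document of one of two equiprobable classes carries
# the full log(2) nats of information.
def _entropy_sketch(v, axis=0):
  v = numpy.array(v, dtype=float)
  s = v.sum(axis=axis)
  with numpy.errstate(divide='ignore', invalid='ignore'):
    return numpy.log(s) - numpy.nansum(v * numpy.log(v), axis=axis) / s

def _demo_ig():
  dist = numpy.array([5, 5])                  # 10 docs over 2 classes
  cm_pos = numpy.array([[5, 0]])              # term occurs in all of class 0
  cm = numpy.dstack((dist - cm_pos, cm_pos))  # (term, event, absent/present)
  x = cm.sum(axis=1)
  term_w = x / x.sum(axis=1)[:, None].astype(float)
  ig = _entropy_sketch(dist) - (term_w * _entropy_sketch(cm, axis=1)).sum(axis=1)
  assert abs(ig[0] - numpy.log(2)) < 1e-9     # perfectly informative term
  return ig
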