def prune_min_domain(self, min_domain):
    # Prune files for all languages that do not occur in at least min_domain domains.
    # Work out which languages to reject as they are not present in at least
    # the required number of domains
    lang_domain_count = defaultdict(int)
    for langs in self.coverage_index.values():
        for lang in langs:
            lang_domain_count[lang] += 1
    reject_langs = set(l for l in lang_domain_count if lang_domain_count[l] < min_domain)

    # Remove the languages from the indexer
    if reject_langs:
        #print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
        reject_ids = set(self.lang_index[l] for l in reject_langs)

        # Build a new, compacted language index and a mapping lm from old
        # language ids to new ones
        new_lang_index = defaultdict(Enumerator())
        lm = dict()
        for k, v in self.lang_index.items():
            if v not in reject_ids:
                new_id = new_lang_index[k]
                lm[v] = new_id

        # Eliminate all entries for the rejected languages
        self.items = [(d, lm[l], n, p) for (d, l, n, p) in self.items if l in lm]

        self.lang_index = new_lang_index
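# The defaultdict(Enumerator()) idiom used throughout relies on a callable
# object that hands out consecutive integer ids, so each previously unseen key
# is assigned the next id on first lookup. A minimal sketch of the behaviour
# assumed here (langid.py ships its own such helper):

class Enumerator(object):
    """Callable returning consecutive integers; with defaultdict(Enumerator())
    each new key gets a fresh id and repeated lookups are stable."""
    def __init__(self, start=0):
        self.n = start

    def __call__(self):
        retval = self.n
        self.n += 1
        return retval

# e.g. ids = defaultdict(Enumerator()); ids['en'] == 0, ids['de'] == 1, ids['en'] == 0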
def select_LD_features(ig_lang, ig_domain, feats_per_lang, ignore_domain=False):
    """
    Select the top feats_per_lang features per language by LD weight, where
    LD is the per-language IG minus the per-domain IG.

    @param ignore_domain boolean to indicate whether to use domain weights
    """
    assert (ig_domain is None) or (len(ig_lang) == len(ig_domain))
    num_lang = len(ig_lang.values()[0])
    num_term = len(ig_lang)

    term_index = defaultdict(Enumerator())

    ld = numpy.empty((num_lang, num_term), dtype=float)

    for term in ig_lang:
        term_id = term_index[term]
        if ignore_domain:
            ld[:, term_id] = ig_lang[term]
        else:
            ld[:, term_id] = ig_lang[term] - ig_domain[term]

    terms = sorted(term_index, key=term_index.get)

    # compile the final feature set: for each language, keep the
    # feats_per_lang terms with the highest LD weight
    selected_features = dict()
    for lang_id, lang_w in enumerate(ld):
        term_inds = numpy.argsort(lang_w)[-feats_per_lang:]
        selected_features[lang_id] = [terms[t] for t in term_inds]

    return selected_features
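# A toy driver for select_LD_features; the IG values below are fabricated
# purely for illustration. ig_lang maps each term to a per-language IG vector,
# while ig_domain maps it to a domain IG weight that broadcasts across
# languages when subtracted:

import numpy

ig_lang = {'aa': numpy.array([0.9, 0.1]),
           'bb': numpy.array([0.2, 0.8]),
           'cc': numpy.array([0.5, 0.5])}
ig_domain = {'aa': numpy.array([0.05]),
             'bb': numpy.array([0.60]),
             'cc': numpy.array([0.10])}

feats = select_LD_features(ig_lang, ig_domain, feats_per_lang=1)
# LD weights are IG_lang - IG_domain, so 'bb' is penalized for being
# domain-informative; for these values feats == {0: ['aa'], 1: ['cc']}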
def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP,
             langs=None, domains=None, line_level=False):
    self.root = root
    self.min_domain = min_domain
    self.proportion = proportion

    if langs is None:
        self.lang_index = defaultdict(Enumerator())
    else:
        # pre-specified lang set
        self.lang_index = dict((k, v) for v, k in enumerate(langs))

    if domains is None:
        self.domain_index = defaultdict(Enumerator())
    else:
        # pre-specified domain set
        self.domain_index = dict((k, v) for v, k in enumerate(domains))

    self.coverage_index = defaultdict(set)
    self.items = list()

    if os.path.isdir(root):
        # root supplied was the root of a directory structure
        candidates = []
        for dirpath, dirnames, filenames in os.walk(root, followlinks=True):
            for docname in filenames:
                candidates.append(os.path.join(dirpath, docname))
    else:
        # root supplied was a file, interpret as a list of paths
        candidates = map(str.strip, open(root))

    if line_level:
        self.index_line(candidates)
    else:
        self.index(candidates)

    self.prune_min_domain(self.min_domain)
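# A hypothetical instantiation of this indexer. The class name CorpusIndexer
# and the root/domain/lang/document corpus layout are assumptions based on how
# the index() pass recovers (domain, lang) labels in langid.py's training
# tools; the path below is a placeholder:

indexer = CorpusIndexer('./corpus', min_domain=1, proportion=0.7)
# items appears to hold (domain_id, lang_id, docname, path) 4-tuples, as
# consumed by prune_min_domain above
print "%d documents, %d languages" % (len(indexer.items), len(indexer.lang_index))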
def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP,
             langs=None, domains=None):
    self.root = root
    self.min_domain = min_domain
    self.proportion = proportion

    if langs is None:
        self.lang_index = defaultdict(Enumerator())
    else:
        # pre-specified lang set
        self.lang_index = dict((k, v) for v, k in enumerate(langs))

    if domains is None:
        self.domain_index = defaultdict(Enumerator())
    else:
        # pre-specified domain set
        self.domain_index = dict((k, v) for v, k in enumerate(domains))

    self.coverage_index = defaultdict(set)
    self.items = list()

    self.index(root)
    self.prune_min_domain(self.min_domain)
def pass_IG(buckets):
    """
    In this pass we compute the information gain for each feature, binarized
    with respect to each language as well as unified over the set of all
    classes.

    @global __features the list of features to compute IG for
    @global __dist the background distribution
    @global __binarize (boolean) compute IG binarized per-class if True
    @global __suffix suffix of files in bucketdir to process

    @param buckets a list of buckets. Each bucket must be a directory that
                   contains files with the appropriate suffix. Each file must
                   contain marshalled (term, event_id, count) triplets.
    """
    global __features, __dist, __binarize, __suffix

    # We first tally the per-event frequency of each term in our selected
    # feature set.
    term_freq = defaultdict(lambda: defaultdict(int))
    term_index = defaultdict(Enumerator())

    for bucket in buckets:
        for path in os.listdir(bucket):
            if path.endswith(__suffix):
                for key, event_id, count in unmarshal_iter(os.path.join(bucket, path)):
                    # Select only our listed features
                    if key in __features:
                        term_index[key]  # touch to assign an id to this term
                        term_freq[key][event_id] += count

    num_term = len(term_index)
    num_event = len(__dist)

    # cm_pos[t, e] is the number of occurrences of term t in event (class) e
    cm_pos = numpy.zeros((num_term, num_event), dtype='int')
    for term, term_id in term_index.iteritems():
        # update event matrix
        freq = term_freq[term]
        for event_id, count in freq.iteritems():
            cm_pos[term_id, int(event_id)] = count
    cm_neg = __dist - cm_pos
    cm = numpy.dstack((cm_neg, cm_pos))

    if not __binarize:
        # non-binarized event space
        x = cm.sum(axis=1)
        term_w = x / x.sum(axis=1)[:, None].astype(float)

        # Entropy of the term-present/term-absent events
        e = entropy(cm, axis=1)

        # Information Gain with respect to the set of events
        ig = entropy(__dist) - (term_w * e).sum(axis=1)
    else:
        # binarized event space: compute IG binarized with respect to each event
        ig = list()
        for event_id in xrange(num_event):
            num_doc = __dist.sum()
            prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]),
                                dtype=float) / num_doc

            cm_bin = numpy.zeros((num_term, 2, 2), dtype=int)  # (term, p(term), p(lang|term))
            cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
            cm_bin[:, 1, :] = cm[:, event_id, :]

            e = entropy(cm_bin, axis=1)
            x = cm_bin.sum(axis=1)
            term_w = x / x.sum(axis=1)[:, None].astype(float)

            ig.append(entropy(prior) - (term_w * e).sum(axis=1))
        ig = numpy.vstack(ig)

    terms = sorted(term_index, key=term_index.get)
    return terms, ig
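# pass_IG depends on an entropy() helper that is neither numpy's nor scipy's:
# it must accept a count array and reduce along a given axis, returning the
# entropy of the normalized counts. A minimal sketch of the assumed semantics,
# using H = log(s) - sum(v*log v)/s, which equals -sum(p*log p) for p = v/s:

import numpy

def entropy(v, axis=0):
    """Entropy of counts v along `axis`; all-zero slices yield 0 so that
    binarized events that never occur contribute no weighted entropy."""
    v = numpy.asarray(v, dtype=float)
    s = v.sum(axis=axis)
    with numpy.errstate(divide='ignore', invalid='ignore'):
        r = numpy.log(s) - numpy.nansum(v * numpy.log(v), axis=axis) / s
    # 0*log(0) produces nan above; define the entropy of an empty slice as 0
    return numpy.where(numpy.isnan(r), 0.0, r)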