Example #1
  def prune_min_domain(self, min_domain):
    # Prune files for all languages that do not occur in at least min_domain
    # domains.

    # Work out which languages to reject because they are not present in at
    # least the required number of domains
    lang_domain_count = defaultdict(int)
    for langs in self.coverage_index.values():
      for lang in langs:
        lang_domain_count[lang] += 1
    reject_langs = set(l for l in lang_domain_count
                       if lang_domain_count[l] < min_domain)

    # Remove the languages from the indexer
    if reject_langs:
      #print "reject (<{0} domains): {1}".format(min_domain, sorted(reject_langs))
      reject_ids = set(self.lang_index[l] for l in reject_langs)
    
      new_lang_index = defaultdict(Enumerator())
      lm = dict()
      for k,v in self.lang_index.items():
        if v not in reject_ids:
          new_id = new_lang_index[k]
          lm[v] = new_id

      # Eliminate all entries for the languages
      self.items = [(d, lm[l], n, p) for (d, l, n, p) in self.items if l in lm]

      self.lang_index = new_lang_index
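
Every snippet on this page builds its index dictionaries with defaultdict(Enumerator()): the Enumerator instance serves as the default factory, so the first lookup of an unseen key assigns it the next integer id, and later lookups return the same id. The helper itself is not shown here; the following is a minimal sketch consistent with that usage, not necessarily the original implementation:

from collections import defaultdict

class Enumerator(object):
    """Callable that returns 0, 1, 2, ... on successive calls."""

    def __init__(self, start=0):
        self.n = start

    def __call__(self):
        retval = self.n
        self.n += 1
        return retval

# Used as a default factory, it hands out a fresh id per unseen key:
lang_index = defaultdict(Enumerator())
assert lang_index["en"] == 0
assert lang_index["fr"] == 1
assert lang_index["en"] == 0  # existing keys keep their id
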
Example #2
def select_LD_features(ig_lang,
                       ig_domain,
                       feats_per_lang,
                       ignore_domain=False):
    """
  @param ignore_domain boolean to indicate whether to use domain weights
  """
    assert (ig_domain is None) or (len(ig_lang) == len(ig_domain))
    num_lang = len(next(iter(ig_lang.values())))
    num_term = len(ig_lang)

    term_index = defaultdict(Enumerator())

    ld = numpy.empty((num_lang, num_term), dtype=float)

    for term in ig_lang:
        term_id = term_index[term]
        if ignore_domain:
            ld[:, term_id] = ig_lang[term]
        else:
            ld[:, term_id] = ig_lang[term] - ig_domain[term]

    terms = sorted(term_index, key=term_index.get)
    # compile the final feature set
    selected_features = dict()
    for lang_id, lang_w in enumerate(ld):
        term_inds = numpy.argsort(lang_w)[-feats_per_lang:]
        selected_features[lang_id] = [terms[t] for t in term_inds]

    return selected_features
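
A toy invocation of select_LD_features, assuming it and the Enumerator sketch above are in scope; the term names and IG values below are purely hypothetical:

import numpy

# Hypothetical per-language IG weights for 3 terms over 2 languages.
ig_lang = {
    "aa": numpy.array([0.9, 0.1]),
    "bb": numpy.array([0.2, 0.8]),
    "cc": numpy.array([0.5, 0.5]),
}
# Hypothetical domain IG per term, subtracted from the language weights.
ig_domain = {
    "aa": numpy.array([0.3, 0.3]),
    "bb": numpy.array([0.1, 0.1]),
    "cc": numpy.array([0.6, 0.6]),
}

selected = select_LD_features(ig_lang, ig_domain, feats_per_lang=1)
print(selected)  # {0: ['aa'], 1: ['bb']}

Subtracting the domain IG penalises terms that are informative about domain rather than language, which is why "cc" is never selected here.
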
Example #3
    def __init__(self,
                 root,
                 min_domain=MIN_DOMAIN,
                 proportion=TRAIN_PROP,
                 langs=None,
                 domains=None,
                 line_level=False):
        self.root = root
        self.min_domain = min_domain
        self.proportion = proportion

        if langs is None:
            self.lang_index = defaultdict(Enumerator())
        else:
            # pre-specified lang set
            self.lang_index = dict((k, v) for v, k in enumerate(langs))

        if domains is None:
            self.domain_index = defaultdict(Enumerator())
        else:
            # pre-specified domain set
            self.domain_index = dict((k, v) for v, k in enumerate(domains))

        self.coverage_index = defaultdict(set)
        self.items = list()

        if os.path.isdir(root):
            # root supplied was the root of a directory structure
            candidates = []
            for dirpath, dirnames, filenames in os.walk(root,
                                                        followlinks=True):
                for docname in filenames:
                    candidates.append(os.path.join(dirpath, docname))
        else:
            # root supplied was a file, interpret as a list of paths
            candidates = map(str.strip, open(root))

        if line_level:
            self.index_line(candidates)
        else:
            self.index(candidates)

        self.prune_min_domain(self.min_domain)
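
The class this constructor belongs to is not named on this page; calling it CorpusIndexer purely as a hypothetical, construction might look like the following (MIN_DOMAIN and TRAIN_PROP are module-level defaults in the original source):

# Hypothetical class name and corpus path.
indexer = CorpusIndexer("corpus/", min_domain=1, langs=["en", "de", "fr"])
# indexer.items now holds (domain_id, lang_id, ..., path) tuples, pruned so
# that every remaining language appears in at least min_domain domains.
print(len(indexer.items))
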
Example #4
    def __init__(self,
                 root,
                 min_domain=MIN_DOMAIN,
                 proportion=TRAIN_PROP,
                 langs=None,
                 domains=None,
                 line_level=False):
        self.root = root
        self.min_domain = min_domain
        self.proportion = proportion

        if langs is None:
            self.lang_index = defaultdict(Enumerator())
        else:
            # pre-specified language set
            self.lang_index = dict((k, v) for v, k in enumerate(langs))

        if domains is None:
            self.domain_index = defaultdict(Enumerator())
        else:
            # pre-specified domain set
            self.domain_index = dict((k, v) for v, k in enumerate(domains))

        self.coverage_index = defaultdict(set)
        self.items = list()

        if os.path.isdir(root):
            candidates = []
            for dirpath, dirnames, filenames in os.walk(root,
                                                        followlinks=True):
                for docname in filenames:
                    candidates.append(os.path.join(dirpath, docname))
        else:
            candidates = map(str.strip, open(root))

        if line_level:
            self.index_line(candidates)
        else:
            self.index(candidates)

        self.prune_min_domain(self.min_domain)
Example #5
  def __init__(self, root, min_domain=MIN_DOMAIN, proportion=TRAIN_PROP, langs=None, domains=None):
    self.root = root
    self.min_domain = min_domain
    self.proportion = proportion 

    if langs is None:
      self.lang_index = defaultdict(Enumerator())
    else:
      # pre-specified lang set
      self.lang_index = dict((k,v) for v,k in enumerate(langs))

    if domains is None:
      self.domain_index = defaultdict(Enumerator())
    else:
      # pre-specified domain set
      self.domain_index = dict((k,v) for v,k in enumerate(domains))

    self.coverage_index = defaultdict(set)
    self.items = list()

    self.index(root)
    self.prune_min_domain(self.min_domain)
Example #6
    def prune_min_domain(self, min_domain):
        lang_domain_count = defaultdict(int)
        for langs in self.coverage_index.values():
            for lang in langs:
                lang_domain_count[lang] += 1
        reject_langs = set(l for l in lang_domain_count
                           if lang_domain_count[l] < min_domain)

        # Remove the rejected languages from the index
        if reject_langs:
            reject_ids = set(self.lang_index[l] for l in reject_langs)
            new_lang_index = defaultdict(Enumerator())
            lm = dict()
            for k, v in self.lang_index.items():
                if v not in reject_ids:
                    new_id = new_lang_index[k]
                    lm[v] = new_id
            # Eliminate all entries for those languages
            self.items = [(d, lm[l], n, p) for (d, l, n, p) in self.items
                          if l in lm]

            self.lang_index = new_lang_index
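
The remapping step above can be traced with a toy index (hypothetical values, using the Enumerator sketch from earlier). Dropping a language leaves a gap in the old ids, and the fresh defaultdict(Enumerator()) compacts the survivors into a contiguous range:

from collections import defaultdict

lang_index = {"en": 0, "fr": 1, "xx": 2}
reject_ids = {1}  # suppose 'fr' fails the min_domain test

new_lang_index = defaultdict(Enumerator())
lm = {}
for k, v in lang_index.items():
    if v not in reject_ids:
        lm[v] = new_lang_index[k]

print(dict(new_lang_index))  # {'en': 0, 'xx': 1}
print(lm)                    # {0: 0, 2: 1}, mapping old id -> new id
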
Example #7
def pass_IG(buckets):
    """
  In this pass we compute the information gain for each feature, binarized 
  with respect to each language as well as unified over the set of all 
  classes. 

  @global __features the list of features to compute IG for
  @global __dist the background distribution
  @global __binarize (boolean) compute IG binarized per-class if True
  @global __suffix of files in bucketdir to process
  @param buckets a list of buckets. Each bucket must be a directory that contains files 
                 with the appropriate suffix. Each file must contain marshalled 
                 (term, event_id, count) triplets.
  """
    global __features, __dist, __binarize, __suffix

    # We first tally the per-event frequency of each
    # term in our selected feature set.
    term_freq = defaultdict(lambda: defaultdict(int))
    term_index = defaultdict(Enumerator())

    for bucket in buckets:
        for path in os.listdir(bucket):
            if path.endswith(__suffix):
                for key, event_id, count in unmarshal_iter(
                        os.path.join(bucket, path)):
                    # Select only our listed features
                    if key in __features:
                        term_index[key]  # touch key to assign it an id
                        term_freq[key][event_id] += count

    num_term = len(term_index)
    num_event = len(__dist)

    cm_pos = numpy.zeros((num_term, num_event), dtype='int')

    for term, term_id in term_index.items():
        # update event matrix
        freq = term_freq[term]
        for event_id, count in freq.items():
            cm_pos[term_id, int(event_id)] = count
    cm_neg = __dist - cm_pos
    cm = numpy.dstack((cm_neg, cm_pos))

    if not __binarize:
        # non-binarized event space
        x = cm.sum(axis=1)
        term_w = x / x.sum(axis=1)[:, None].astype(float)

        # Entropy of the term-present/term-absent events
        e = entropy(cm, axis=1)

        # Information Gain with respect to the set of events
        ig = entropy(__dist) - (term_w * e).sum(axis=1)

    else:
        # binarized event space
        # Compute IG binarized with respect to each event
        ig = list()
        for event_id in range(num_event):
            num_doc = __dist.sum()
            prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]),
                                dtype=float) / num_doc

            cm_bin = numpy.zeros((num_term, 2, 2),
                                 dtype=int)  # (term, p(term), p(lang|term))
            cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
            cm_bin[:, 1, :] = cm[:, event_id, :]

            e = entropy(cm_bin, axis=1)
            x = cm_bin.sum(axis=1)
            term_w = x / x.sum(axis=1)[:, None].astype(float)

            ig.append(entropy(prior) - (term_w * e).sum(axis=1))
        ig = numpy.vstack(ig)

    terms = sorted(term_index, key=term_index.get)
    return terms, ig
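
Both versions of pass_IG lean on an entropy helper that is not shown on this page. The quantity they compute is the classic information gain, IG(t) = H(E) - P(t present) * H(E | t present) - P(t absent) * H(E | t absent), where E ranges over the events (classes). A minimal stand-in consistent with how entropy is called above, offered as an assumption rather than the original implementation:

import numpy

def entropy(counts, axis=0):
    # Shannon entropy (in nats) of a count array, normalised along `axis`.
    counts = numpy.asarray(counts, dtype=float)
    p = counts / numpy.expand_dims(counts.sum(axis=axis), axis)
    with numpy.errstate(divide='ignore', invalid='ignore'):
        terms = numpy.where(p > 0, -p * numpy.log(p), 0.0)
    return terms.sum(axis=axis)
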
Example #8
def pass_IG(buckets):
    global __features, __dist, __binarize, __suffix
    # First tally the per-event frequency of each term in the selected feature set
    term_freq = defaultdict(lambda: defaultdict(int))
    term_index = defaultdict(Enumerator())

    for bucket in buckets:
        for path in os.listdir(bucket):
            if path.endswith(__suffix):
                for key, event_id, count in unmarshal_iter(
                        os.path.join(bucket, path)):
                    # Select only the listed features
                    if key in __features:
                        term_index[key]  # touch key to assign it an id
                        term_freq[key][event_id] += count

    num_term = len(term_index)
    num_event = len(__dist)

    cm_pos = numpy.zeros((num_term, num_event), dtype='int')

    for term, term_id in term_index.items():
        # update the event matrix
        freq = term_freq[term]
        for event_id, count in freq.items():
            cm_pos[term_id, int(event_id)] = count
    cm_neg = __dist - cm_pos
    cm = numpy.dstack((cm_neg, cm_pos))

    if not __binarize:
        # non-binarized event space
        x = cm.sum(axis=1)
        term_w = x / x.sum(axis=1)[:, None].astype(float)

        # Entropy of the term-present/term-absent events
        e = entropy(cm, axis=1)

        # Information gain with respect to the full set of events
        ig = entropy(__dist) - (term_w * e).sum(axis=1)

    else:
        ig = list()
        for event_id in range(num_event):
            num_doc = __dist.sum()
            prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]),
                                dtype=float) / num_doc

            cm_bin = numpy.zeros((num_term, 2, 2),
                                 dtype=int)  # (term, p(term), p(lang|term))
            cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
            cm_bin[:, 1, :] = cm[:, event_id, :]

            e = entropy(cm_bin, axis=1)
            x = cm_bin.sum(axis=1)
            term_w = x / x.sum(axis=1)[:, None].astype(float)

            ig.append(entropy(prior) - (term_w * e).sum(axis=1))
        ig = numpy.vstack(ig)

    terms = sorted(term_index, key=term_index.get)
    return terms, ig