Code Example #1
File: NBtrain.py  Project: zaswed76/diareader
def pass_ptc(b_dir):
  """
  Take a bucket, form a feature map, compute the count of
  each feature in each class.
  @param b_dir path to the bucket directory
  @returns (read_count, f_ids, prod) 
  """
  global __cm, __num_instances, __chunk_offsets

  terms = defaultdict(lambda : np.zeros((__num_instances,), dtype='int'))

  read_count = 0
  for path in os.listdir(b_dir):
    if path.endswith('.index'):
      for f_id, chunk_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)):
        index = doc_id + __chunk_offsets[chunk_id]
        terms[f_id][index] = count
        read_count += 1

  f_ids, f_vs = zip(*terms.items())
  fm = np.vstack(f_vs)
  # The calculation of the term-class distribution is done per-chunk rather
  # than globally for memory efficiency reasons.
  prod = np.dot(fm, __cm)

  return read_count, f_ids, prod
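
The (read_count, f_ids, prod) tuples returned per bucket still have to be combined into the full feature-by-class count matrix. A minimal sketch of that aggregation step, assuming f_ids are integer indices into a global feature list and that the caller knows the total number of features and classes (the helper and its name are assumptions, not part of these snippets):

import numpy as np

def accumulate_ptc(bucket_results, num_features, num_classes):
    # Hypothetical aggregation: scatter each bucket's partial product rows
    # into a single feature-by-class count matrix.
    ptc = np.zeros((num_features, num_classes), dtype=int)
    for read_count, f_ids, prod in bucket_results:
        for row, f_id in enumerate(f_ids):
            ptc[f_id] += prod[row]
    return ptc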
Code Example #2
def pass_ptc(b_dir):
    """
  Take a bucket, form a feature map, compute the count of
  each feature in each class.
  @param b_dir path to the bucket directory
  @returns (read_count, f_ids, prod) 
  """
    global __cm, __num_instances, __chunk_offsets

    terms = defaultdict(lambda: np.zeros((__num_instances, ), dtype='int'))

    read_count = 0
    for path in os.listdir(b_dir):
        if path.endswith('.index'):
            for f_id, chunk_id, doc_id, count in unmarshal_iter(
                    os.path.join(b_dir, path)):
                index = doc_id + __chunk_offsets[chunk_id]
                terms[f_id][index] = count
                read_count += 1

    f_ids, f_vs = zip(*terms.items())
    fm = np.vstack(f_vs)
    # The calculation of the term-class distribution is done per-chunk rather
    # than globally for memory efficiency reasons.
    prod = np.dot(fm, __cm)

    return read_count, f_ids, prod
Code Example #3
File: featureselect.py  Project: saffsd/linguini.py
def pass_sum_lf(bucket):
  """
  Compute 'document frequency' as defined by Prager(1999) - the number of
  languages a term appears in.
  """
  term_langs = defaultdict(set)

  for path in os.listdir(bucket):
    if path.endswith('.lang'):
      for key, lang, _ in unmarshal_iter(os.path.join(bucket,path)):
        term_langs[key].add(lang)
  
  retval = dict( (k,len(v)) for k,v in term_langs.iteritems() ) 
  return retval
Code Example #4
File: featureselect.py  Project: saffsd/linguini.py
def pass_sum_lf(bucket):
    """
  Compute 'document frequency' as defined by Prager(1999) - the number of
  languages a term appears in.
  """
    term_langs = defaultdict(set)

    for path in os.listdir(bucket):
        if path.endswith('.lang'):
            for key, lang, _ in unmarshal_iter(os.path.join(bucket, path)):
                term_langs[key].add(lang)

    retval = dict((k, len(v)) for k, v in term_langs.iteritems())
    return retval
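
The per-bucket dictionaries returned by pass_sum_lf presumably feed the __lang_count mapping read by pass_select and pass_tfilf further down. A minimal sketch of that merge, assuming each term hashes into exactly one bucket so the per-bucket results are disjoint (an assumption, not shown in these snippets):

def merge_lang_counts(bucket_results):
    # Hypothetical merge step: with disjoint per-bucket results a plain dict
    # update suffices; overlapping buckets would instead require unioning the
    # underlying per-language sets before taking their sizes.
    lang_count = {}
    for partial in bucket_results:
        lang_count.update(partial)
    return lang_count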
Code Example #5
def pass_sum_df(bucket):
  """
  Compute document frequency (df) by summing up (key,domain,count) triplets
  over all domains.
  """
  doc_count = defaultdict(int)
  count = 0
  with gzip.open(os.path.join(bucket, "docfreq"),'wb') as docfreq:
    for path in os.listdir(bucket):
      # We use the domain buckets as there are usually fewer domains
      if path.endswith('.domain'):
        for key, _, value in unmarshal_iter(os.path.join(bucket,path)):
          doc_count[key] += value
          count += 1
    
    for item in doc_count.iteritems():
      docfreq.write(marshal.dumps(item))
  return count
Code Example #6
File: NBtrain.py  Project: jiuyue99207/lan_id
def pass_ptc(b_dir):
    global __cm, __num_instances, __chunk_offsets
    terms = defaultdict(lambda: np.zeros((__num_instances, ), dtype='int'))

    read_count = 0
    for path in os.listdir(b_dir):
        if path.endswith('.index'):
            for f_id, chunk_id, doc_id, count in unmarshal_iter(
                    os.path.join(b_dir, path)):
                index = doc_id + __chunk_offsets[chunk_id]
                terms[f_id][index] = count
                read_count += 1

    f_ids, f_vs = zip(*terms.items())
    fm = np.vstack(f_vs)
    prod = np.dot(fm, __cm)

    return read_count, f_ids, prod
Code Example #7
File: featureselect.py  Project: saffsd/linguini.py
def pass_select(bucket):
    """
  Select features from a chunk that meet our selection criteria.
  """
    global __lang_count, __k

    # Compute the term-language frequency first
    term_lang_count = defaultdict(int)
    for path in os.listdir(bucket):
        if path.endswith('.lang'):
            for key, lang, value in unmarshal_iter(os.path.join(bucket, path)):
                term_lang_count[key, lang] += value

    features = set()
    for (term, lang), count in term_lang_count.iteritems():
        if count >= __k * __lang_count[term]:
            features.add(term)
    return features
Code Example #8
File: featureselect.py  Project: saffsd/linguini.py
def pass_select(bucket):
  """
  Select features from a chunk that meet our selection criteria.
  """
  global __lang_count, __k

  # Compute the term-language frequency first
  term_lang_count = defaultdict(int)
  for path in os.listdir(bucket):
    if path.endswith('.lang'):
      for key, lang, value in unmarshal_iter(os.path.join(bucket,path)):
        term_lang_count[key, lang] += value
    
  features = set()
  for (term, lang), count in term_lang_count.iteritems():
    if count >= __k * __lang_count[term]:
      features.add(term)
  return features
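
pass_select reads its threshold __k and the per-term language counts __lang_count from module globals, which in this codebase appear to be initialized once per worker process. A minimal sketch of a caller along those lines (the setup helper and the final union step are assumptions, not taken from the project):

def setup(lang_count, k):
    # Hypothetical initializer: stash shared parameters in module globals so
    # worker processes can read them without receiving them per task.
    global __lang_count, __k
    __lang_count = lang_count
    __k = k

def select_features(buckets, lang_count, k):
    setup(lang_count, k)
    # The final feature set is the union of the per-bucket selections.
    return set().union(*(pass_select(b) for b in buckets))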
Code Example #9
File: DFfeatureselect.py  Project: 4line/langid.py
def pass_sum_df(bucket):
  """
  Compute document frequency (df) by summing up (key,domain,count) triplets
  over all domains.
  """
  doc_count = defaultdict(int)
  count = 0
  with gzip.open(os.path.join(bucket, "docfreq"),'wb') as docfreq:
    for path in os.listdir(bucket):
      # We use the domain buckets as there are usually fewer domains
      if path.endswith('.domain'):
        for key, _, value in unmarshal_iter(os.path.join(bucket,path)):
          doc_count[key] += value
          count += 1
    
    for item in doc_count.iteritems():
      docfreq.write(marshal.dumps(item))
  return count
Code Example #10
File: DFfeatureselect.py  Project: 4line/langid.py
def tally(bucketlist, jobs=None):
  """
  Sum up the counts for each feature across all buckets. This
  builds a full mapping of feature->count. This is stored in-memory
  and thus could be an issue for large feature sets.
  """

  with MapPool(jobs) as f:
    pass_sum_df_out = f(pass_sum_df, bucketlist)

    for i, keycount in enumerate(pass_sum_df_out):
      print "processed bucket (%d/%d) [%d keys]" % (i+1, len(bucketlist), keycount)

  # build the global term->df mapping
  doc_count = {}
  for bucket in bucketlist:
    for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')):
      doc_count[key] = value

  return doc_count
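
tally returns the global feature->df mapping, but these snippets do not show how it is consumed. A minimal sketch of the usual next step, keeping only the highest-df features (the cutoff n and the helper name are assumptions):

from operator import itemgetter

def top_features_by_df(doc_count, n):
    # Hypothetical selection: rank features by document frequency and keep
    # the n most frequent ones.
    ranked = sorted(doc_count.items(), key=itemgetter(1), reverse=True)
    return [feature for feature, df in ranked[:n]]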
Code Example #11
def tally(bucketlist, jobs=None):
    """
  Sum up the counts for each feature across all buckets. This
  builds a full mapping of feature->count. This is stored in-memory
  and thus could be an issue for large feature sets.
  """

    with MapPool(jobs) as f:
        pass_sum_df_out = f(pass_sum_df, bucketlist)

        for i, keycount in enumerate(pass_sum_df_out):
            print "processed bucket (%d/%d) [%d keys]" % (
                i + 1, len(bucketlist), keycount)

    # build the global term->df mapping
    doc_count = {}
    for bucket in bucketlist:
        for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')):
            doc_count[key] = value

    return doc_count
Code Example #12
def pass_fm(b_dir):
  """
  Take a bucket, form a feature map, compute the count of
  each feature in each class.
  @param b_dir path to the bucket directory
  @returns (read_count, f_ids, prod) 
  """
  global __num_instances, __chunk_offsets

  terms = defaultdict(lambda : np.zeros((__num_instances,), dtype='int'))

  read_count = 0
  for path in os.listdir(b_dir):
    if path.endswith('.index'):
      for f_id, chunk_id, doc_id, count in unmarshal_iter(os.path.join(b_dir, path)):
        index = doc_id + __chunk_offsets[chunk_id]
        terms[f_id][index] = count
        read_count += 1

  f_ids, f_vs = zip(*terms.items())
  fm = np.vstack(f_vs)
  return read_count, f_ids, fm
Code Example #13
File: featureselect.py  Project: saffsd/linguini.py
def pass_tfilf(bucket):
  """
  Select top-n features from a chunk by Prager's TFILF criteria.
  """
  global __lang_count, __count

  # Compute the term-language frequency first
  term_lang_count = defaultdict(lambda:defaultdict(int))
  for path in os.listdir(bucket):
    if path.endswith('.lang'):
      for key, lang, value in unmarshal_iter(os.path.join(bucket,path)):
        term_lang_count[key][lang] += value
    
  def f_iter():
    for term in term_lang_count:
      lf = float(__lang_count[term])
      tf = max(term_lang_count[term].values()) # most frequent language
      rval = random.random() # this is used to randomize the order of the same-score items
      yield (tf/lf, rval, term) # TF-ILF score as described by Prager


  retval = heapq.nlargest(__count, f_iter())
  return retval
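
The (tf/lf, rval, term) tuples make heapq.nlargest order same-score terms by the random value instead of by the term itself, which is why rval sits between the score and the term. A small illustration (not from the project):

import heapq, random

scored = [(1.0, random.random(), term) for term in ("aa", "ab", "ba")]
# Tuples compare element-wise, so the equal TF-ILF scores fall through to
# the random tiebreaker rather than to alphabetical order of the terms.
top2 = heapq.nlargest(2, scored)
print([term for _, _, term in top2])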
Code Example #14
File: featureselect.py  Project: saffsd/linguini.py
def pass_tfilf(bucket):
    """
  Select top-n features from a chunk by Prager's TFILF criteria.
  """
    global __lang_count, __count

    # Compute the term-language frequency first
    term_lang_count = defaultdict(lambda: defaultdict(int))
    for path in os.listdir(bucket):
        if path.endswith('.lang'):
            for key, lang, value in unmarshal_iter(os.path.join(bucket, path)):
                term_lang_count[key][lang] += value

    def f_iter():
        for term in term_lang_count:
            lf = float(__lang_count[term])
            tf = max(term_lang_count[term].values())  # most frequent language
            rval = random.random()  # this is used to randomize the order of the same-score items
            yield (tf / lf, rval, term)  # TF-ILF score as described by Prager

    retval = heapq.nlargest(__count, f_iter())
    return retval
Code Example #15
def pass_IG(bucket):
    """
  In this pass we compute the information gain for each feature, binarized 
  with respect to each language as well as unified over the set of all 
  classes. 

  @global __features the list of features to compute IG for
  @global __dist the background distribution
  @global __binarize (boolean) compute IG binarized per-class if True
  @global __suffix of files in bucketdir to process
  @param bucket the bucket file to process. It is assumed to contain marshalled (term, event_id, count) triplets.
  """
    global __features, __dist, __binarize, __suffix

    # We first tally the per-event frequency of each
    # term in our selected feature set.
    term_freq = defaultdict(lambda: defaultdict(int))
    term_index = defaultdict(Enumerator())

    for path in os.listdir(bucket):
        if path.endswith(__suffix):
            for key, event_id, count in unmarshal_iter(os.path.join(bucket, path)):
                # Select only our listed features
                if key in __features:
                    term_index[key]  # touch the defaultdict so the Enumerator assigns this term the next index
                    term_freq[key][event_id] += count

    num_term = len(term_index)
    num_event = len(__dist)

    cm_pos = numpy.zeros((num_term, num_event), dtype="int")

    for term, term_id in term_index.iteritems():
        # update event matrix
        freq = term_freq[term]
        for event_id, count in freq.iteritems():
            cm_pos[term_id, event_id] = count
    cm_neg = __dist - cm_pos
    cm = numpy.dstack((cm_neg, cm_pos))

    if not __binarize:
        # non-binarized event space
        x = cm.sum(axis=1)
        term_w = x / x.sum(axis=1)[:, None].astype(float)

        # Entropy of the term-present/term-absent events
        e = entropy(cm, axis=1)

        # Information Gain with respect to the set of events
        ig = entropy(__dist) - (term_w * e).sum(axis=1)

    else:
        # binarized event space
        # Compute IG binarized with respect to each event
        ig = list()
        for event_id in xrange(num_event):
            num_doc = __dist.sum()
            prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]), dtype=float) / num_doc

            cm_bin = numpy.zeros((num_term, 2, 2), dtype=int)  # (term, p(term), p(lang|term))
            cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
            cm_bin[:, 1, :] = cm[:, event_id, :]

            e = entropy(cm_bin, axis=1)
            x = cm_bin.sum(axis=1)
            term_w = x / x.sum(axis=1)[:, None].astype(float)

            ig.append(entropy(prior) - (term_w * e).sum(axis=1))
        ig = numpy.vstack(ig)

    terms = sorted(term_index, key=term_index.get)
    return terms, ig
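
Both pass_IG variants call an entropy() helper that is not included in these snippets. A minimal sketch of a compatible implementation, assuming it computes Shannon entropy of unnormalized counts along the given axis with 0 * log(0) treated as 0:

import numpy

def entropy(v, axis=0):
    # Hypothetical stand-in for the helper assumed above:
    # H = log(sum v) - sum(v * log v) / sum v, evaluated along `axis`.
    v = numpy.asarray(v, dtype=float)
    s = v.sum(axis=axis)
    v_log_v = numpy.where(v > 0, v * numpy.log(numpy.where(v > 0, v, 1.0)), 0.0)
    s_safe = numpy.where(s > 0, s, 1.0)
    return numpy.log(s_safe) - v_log_v.sum(axis=axis) / s_safe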
Code Example #16
File: IGweight.py  Project: Esiravegna/langid.py
def pass_IG(buckets):
    """
  In this pass we compute the information gain for each feature, binarized 
  with respect to each language as well as unified over the set of all 
  classes. 

  @global __features the list of features to compute IG for
  @global __dist the background distribution
  @global __binarize (boolean) compute IG binarized per-class if True
  @global __suffix of files in bucketdir to process
  @param buckets a list of buckets. Each bucket must be a directory that contains files 
                 with the appropriate suffix. Each file must contain marshalled 
                 (term, event_id, count) triplets.
  """
    global __features, __dist, __binarize, __suffix

    # We first tally the per-event frequency of each
    # term in our selected feature set.
    term_freq = defaultdict(lambda: defaultdict(int))
    term_index = defaultdict(Enumerator())

    for bucket in buckets:
        for path in os.listdir(bucket):
            if path.endswith(__suffix):
                for key, event_id, count in unmarshal_iter(
                        os.path.join(bucket, path)):
                    # Select only our listed features
                    if key in __features:
                        term_index[key]  # touch the defaultdict so the Enumerator assigns this term the next index
                        term_freq[key][event_id] += count

    num_term = len(term_index)
    num_event = len(__dist)

    cm_pos = numpy.zeros((num_term, num_event), dtype='int')

    for term, term_id in term_index.iteritems():
        # update event matrix
        freq = term_freq[term]
        for event_id, count in freq.iteritems():
            cm_pos[term_id, int(event_id)] = count
    cm_neg = __dist - cm_pos
    cm = numpy.dstack((cm_neg, cm_pos))

    if not __binarize:
        # non-binarized event space
        x = cm.sum(axis=1)
        term_w = x / x.sum(axis=1)[:, None].astype(float)

        # Entropy of the term-present/term-absent events
        e = entropy(cm, axis=1)

        # Information Gain with respect to the set of events
        ig = entropy(__dist) - (term_w * e).sum(axis=1)

    else:
        # binarized event space
        # Compute IG binarized with respect to each event
        ig = list()
        for event_id in xrange(num_event):
            num_doc = __dist.sum()
            prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]),
                                dtype=float) / num_doc

            cm_bin = numpy.zeros((num_term, 2, 2),
                                 dtype=int)  # (term, p(term), p(lang|term))
            cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
            cm_bin[:, 1, :] = cm[:, event_id, :]

            e = entropy(cm_bin, axis=1)
            x = cm_bin.sum(axis=1)
            term_w = x / x.sum(axis=1)[:, None].astype(float)

            ig.append(entropy(prior) - (term_w * e).sum(axis=1))
        ig = numpy.vstack(ig)

    terms = sorted(term_index, key=term_index.get)
    return terms, ig
Code Example #17
def pass_IG(buckets):
    global __features, __dist, __binarize, __suffix
    # We first tally the per-event frequency of each term in our selected feature set
    term_freq = defaultdict(lambda: defaultdict(int))
    term_index = defaultdict(Enumerator())

    for bucket in buckets:
        for path in os.listdir(bucket):
            if path.endswith(__suffix):
                for key, event_id, count in unmarshal_iter(
                        os.path.join(bucket, path)):
                    # Select only our listed features
                    if key in __features:
                        term_index[key]  # touch the defaultdict so the Enumerator assigns this term the next index
                        term_freq[key][event_id] += count

    num_term = len(term_index)
    num_event = len(__dist)

    cm_pos = numpy.zeros((num_term, num_event), dtype='int')

    for term, term_id in term_index.iteritems():
        # update the event matrix
        freq = term_freq[term]
        for event_id, count in freq.iteritems():
            cm_pos[term_id, event_id] = count
    cm_neg = __dist - cm_pos
    cm = numpy.dstack((cm_neg, cm_pos))

    if not __binarize:
        # non-binarized event space
        x = cm.sum(axis=1)
        term_w = x / x.sum(axis=1)[:, None].astype(float)

        # Entropy of the term-present/term-absent events
        e = entropy(cm, axis=1)

        # the resulting information gain
        ig = entropy(__dist) - (term_w * e).sum(axis=1)

    else:
        ig = list()
        for event_id in xrange(num_event):
            num_doc = __dist.sum()
            prior = numpy.array((num_doc - __dist[event_id], __dist[event_id]),
                                dtype=float) / num_doc

            cm_bin = numpy.zeros((num_term, 2, 2),
                                 dtype=int)  # (term, p(term), p(lang|term))
            cm_bin[:, 0, :] = cm.sum(axis=1) - cm[:, event_id, :]
            cm_bin[:, 1, :] = cm[:, event_id, :]

            e = entropy(cm_bin, axis=1)
            x = cm_bin.sum(axis=1)
            term_w = x / x.sum(axis=1)[:, None].astype(float)

            ig.append(entropy(prior) - (term_w * e).sum(axis=1))
        ig = numpy.vstack(ig)

    terms = sorted(term_index, key=term_index.get)
    return terms, ig