Example #1
def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
    global b_dirs
    num_instances = len(paths)
    num_features = max(i for v in tk_output.values() for i in v) + 1

    # Generate the feature map
    nm_arr = mp.Array('i', tk_nextmove, lock=False)

    if args.jobs:
        chunksize = min(len(paths) / (args.jobs * 2), args.chunksize)
    else:
        chunksize = min(len(paths) / (mp.cpu_count() * 2), args.chunksize)

    # TODO: Set the output dir
    b_dirs = [
        tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path)
        for i in range(args.buckets)
    ]

    output_states = set(tk_output)

    path_chunks = list(chunk(paths, chunksize))
    pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)

    pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs)
    with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
        pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

    write_count = sum(pass_tokenize_out)
    if not SILENT:
        print "wrote a total of %d keys" % write_count

    pass_ptc_params = (cm, num_instances)
    with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
        pass_ptc_out = f(pass_ptc, b_dirs)

    reads, ids, prods = zip(*pass_ptc_out)
    read_count = sum(reads)
    if not SILENT:
        print "read a total of %d keys (%d short)" % (read_count,
                                                      write_count - read_count)

    prod = np.zeros((num_features, cm.shape[1]), dtype=int)
    prod[np.concatenate(ids)] = np.vstack(prods)
    ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

    nb_ptc = array.array('d')
    for term_dist in ptc.tolist():
        nb_ptc.extend(term_dist)
    return nb_ptc
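
The final lines of `learn_ptc` implement add-one (Laplace) smoothing of the per-language term counts in log space: each count is incremented by one and normalised by the vocabulary size plus the per-language total. A minimal standalone sketch of just that step, using a tiny made-up count matrix in place of `prod` (an illustration, not part of the original code):

import numpy as np

# Hypothetical counts: 4 features (rows) x 2 languages (columns)
prod = np.array([[3, 0],
                 [1, 2],
                 [0, 0],
                 [5, 1]])
num_features = prod.shape[0]

# log P(t|C) = log(1 + count) - log(num_features + total count for C)
ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

# Each column is now a proper log-distribution over the features
print(np.exp(ptc).sum(0))  # -> [1. 1.]
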
Example #2
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE,
                sample_count=None,
                sample_size=None,
                term_freq=False,
                line_level=False):
    """
  @param items a list of (domain, language, path) tuples
  """
    global b_dirs, complete

    # Our exitfunc uses this to know whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        os.path.join(outdir, "bucket{0}".format(i)) for i in range(buckets)
    ]

    for d in b_dirs:
        os.mkdir(d)

    # PASS 1: Tokenize documents into sets of terms

    # If there are few items, make the chunk size such that each job
    # will have 2 chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size,
                             term_freq, line_level)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)

        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
        print "job count: {0}".format(jobs)

        if sample_count:
            print "sampling-based tokenization: size {0} count {1}".format(
                sample_size, sample_count)
        else:
            print "whole-document tokenization"

        for i, keycount in enumerate(pass_tokenize_out):
            print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count,
                                                         keycount)

    complete = True

    return b_dirs
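
Every example here splits its work list with a `chunk` helper that is defined elsewhere in the project. As a reading aid, here is a minimal sketch of a compatible implementation (an assumption; the project's own helper may differ in detail):

def chunk(seq, chunksize):
    # Yield successive lists of up to `chunksize` items from `seq`
    # (a sketch of the helper assumed by build_index and friends).
    seq = list(seq)
    for i in range(0, len(seq), chunksize):
        yield seq[i:i + chunksize]
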
Example #3
File: model.py Project: saffsd/linguini.py
def learn_ftc(paths, tk_nextmove, tk_output, cm, temp_path, args):
    global b_dirs
    num_instances = len(paths)
    num_features = max(i for v in tk_output.values() for i in v) + 1

    # Generate the feature map
    nm_arr = mp.Array('i', tk_nextmove, lock=False)

    if args.jobs:
        chunksize = min(len(paths) / (args.jobs * 2), args.chunksize)
    else:
        chunksize = min(len(paths) / (mp.cpu_count() * 2), args.chunksize)

    # TODO: Set the output dir
    b_dirs = [
        tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path)
        for i in range(args.buckets)
    ]

    output_states = set(tk_output)

    path_chunks = list(chunk(paths, chunksize))
    pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)

    pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs)
    with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
        pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

    write_count = sum(pass_tokenize_out)
    logger.info("wrote a total of %d keys", write_count)

    # TODO: Report on the progress of this pass
    pass_ftc_params = (cm, num_instances)
    with MapPool(args.jobs, setup_pass_ftc, pass_ftc_params) as f:
        pass_ftc_out = f(pass_ftc, b_dirs)

    reads, ids, prods = zip(*pass_ftc_out)
    read_count = sum(reads)
    logger.info("read a total of %d keys (%d short)", read_count,
                write_count - read_count)

    # Re-order the weights into a single ndarray
    term_lang_counts = np.zeros((num_features, cm.shape[1]), dtype=int)
    term_lang_counts[np.concatenate(ids)] = np.vstack(prods)
    return term_lang_counts
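
`learn_ptc` and `learn_ftc` also pair each chunk with its starting offset via an `offsets` helper that is not shown. A plausible sketch, assuming it yields the cumulative start index of each chunk within the flattened path list (the project's own helper may differ):

def offsets(chunks):
    # Yield the starting index of each chunk in the flattened sequence
    # (assumed behaviour, for illustration only).
    start = 0
    for c in chunks:
        yield start
        start += len(c)
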
Example #4
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE,
                sample_count=None,
                sample_size=None,
                term_freq=False,
                line_level=False):
    global b_dirs, complete

    # Used to decide whether the tokenized files should be deleted
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        os.path.join(outdir, "bucket{0}".format(i)) for i in range(buckets)
    ]

    for d in b_dirs:
        os.mkdir(d)

    # PASS 1: Split the documents into chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size,
                             term_freq, line_level)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)
        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
        print "job count: {0}".format(jobs)
        if sample_count:
            print "sampling-based tokenization: size {0} count {1}".format(
                sample_size, sample_count)
        else:
            print "whole-document tokenization"

        for i, keycount in enumerate(pass_tokenize_out):
            print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count,
                                                         keycount)

    complete = True

    return b_dirs
Example #5
def tally_lf(bucketlist, jobs=None):
    """
  Sum up (key, value) pairs across all buckets. This builds a global
  mapping from each term to the number of languages it occurs in.
  """

    lang_count = {}
    with MapPool(jobs) as f:
        pass_sum_lf_out = f(pass_sum_lf, bucketlist)

        for i, v in enumerate(pass_sum_lf_out):
            lang_count.update(v)
            logger.debug("processed bucket ({0}/{1}) [{2} terms]".format(
                i + 1, len(bucketlist), len(v)))

    return lang_count
Example #6
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE):
    """
  @param items a list of (domain, language, path) tuples
  """
    global b_dirs, complete

    # Our exitfunc uses this to know whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        tempfile.mkdtemp(prefix="tokenize-",
                         suffix='-{0}'.format(tokenizer.__class__.__name__),
                         dir=outdir) for i in range(buckets)
    ]

    # PASS 1: Tokenize documents into sets of terms

    # If there are few items, make the chunk size such that each job
    # will have 2 chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)

        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
        print "job count: {0}".format(jobs)

        for i, keycount in enumerate(pass_tokenize_out):
            print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count,
                                                         keycount)

    complete = True

    return b_dirs
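
All of these passes are driven through a `MapPool` context manager whose definition is not included in the listing. A minimal compatible sketch, assuming it wraps a multiprocessing pool with an optional per-worker initializer and yields an imap-like callable (the real implementation may additionally special-case `jobs == 1` or tune chunking):

import contextlib
import multiprocessing as mp

@contextlib.contextmanager
def MapPool(jobs=None, initializer=None, initargs=tuple()):
    # Sketch only: create a pool, run the initializer in each worker,
    # and expose pool.imap as the callable bound by `with ... as f`.
    pool = mp.Pool(jobs, initializer, initargs)
    try:
        yield pool.imap
    finally:
        pool.close()
        pool.join()
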
Example #7
def prager_select(bucketlist, lang_count, k, jobs=None):
    """
  Compute the feature selection score according to Prager (1999).
  This is basically a tf-idf computation (where the 'df' used is the
  number of languages a term occurs in rather than the number of training
  documents).

  @param k threshold value for selection. We select when score > k
  """
    features = set()
    with MapPool(jobs, setup_pass_select, (lang_count, k)) as f:
        pass_select_out = f(pass_select, bucketlist)

        for i, feats in enumerate(pass_select_out):
            features |= feats
            logger.debug("processed bucket ({0}/{1}) [selected {2}]".format(
                i + 1, len(bucketlist), len(feats)))

    return features
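
The per-bucket scoring itself happens in `pass_select`, which is not shown here. Purely as an illustration of the idea in the docstring (term frequency weighted by an inverse "language frequency", selected when the score exceeds k), one plausible form is sketched below; the name and the exact formula are assumptions, not necessarily what `pass_select` implements:

import math

def prager_score(tf, lf, num_langs):
    # Hypothetical tf-ilf score: term frequency weighted by how few
    # languages the term occurs in (assumed form, for illustration only).
    return tf * math.log(float(num_langs) / lf)

# A term seen 120 times but in only 2 of 97 languages scores highly...
print(prager_score(120, 2, 97) > 50)   # True
# ...while a term seen in every language scores zero and is never selected.
print(prager_score(500, 97, 97) > 50)  # False
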
Example #8
def tally(bucketlist, jobs=None):
    """
  Sum up the counts for each feature across all buckets. This
  builds a full feature->count mapping, which is stored in memory and
  thus could be an issue for large feature sets.
  """

    with MapPool(jobs) as f:
        pass_sum_df_out = f(pass_sum_df, bucketlist)

        for i, keycount in enumerate(pass_sum_df_out):
            print "processed bucket (%d/%d) [%d keys]" % (
                i + 1, len(bucketlist), keycount)

    # build the global term->df mapping
    doc_count = {}
    for bucket in bucketlist:
        for key, value in unmarshal_iter(os.path.join(bucket, 'docfreq')):
            doc_count[key] = value

    return doc_count
Example #9
def compute_IG(bucketlist, features, dist, binarize, suffix, job_count=None):
  pass_IG_args = (features, dist, binarize, suffix)

  num_chunk = len(bucketlist)
  weights = []
  terms = []

  with MapPool(job_count, setup_pass_IG, pass_IG_args) as f:
    pass_IG_out = f(pass_IG, bucketlist)

    for i, (t, w) in enumerate(pass_IG_out):
      weights.append(w)
      terms.extend(t)
      print "processed chunk (%d/%d) [%d terms]" % (i+1, num_chunk, len(t))

  if binarize:
    weights = numpy.hstack(weights).transpose()
  else:
    weights = numpy.concatenate(weights)
  terms = ["".join(t) for t in terms]

  return zip(terms, weights)
Example #10
def tfilf_select(bucketlist, lang_count, count, jobs=None):
    """
  Select features based on the top-N scores, using the same scoring as
  Prager but with a fixed number of features rather than a variable number
  determined by the threshold k. We optimize slightly by observing that the
  `count` features selected overall could, in the extreme case, all come
  from a single bucket; hence we select the top `count` from each bucket
  and then keep the top `count` of those.
  """
    features = []
    with MapPool(jobs, setup_pass_tfilf, (
            lang_count,
            count,
    )) as f:
        pass_tfilf_out = f(pass_tfilf, bucketlist)

        for i, feats in enumerate(pass_tfilf_out):
            # Keep selecting n-largest from the previous output and the new candidates
            features = heapq.nlargest(count, itertools.chain(features, feats))
            logger.debug("processed bucket ({0}/{1})".format(
                i + 1, len(bucketlist)))

    return [f for c, r, f in sorted(features, reverse=True)]
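
The selection loop above never holds more than `count` candidates: after each bucket it merges the running result with the bucket's output and keeps only the `count` largest. A small standalone illustration of that pattern, with made-up (score, tiebreak, feature) tuples of the shape the final sort assumes:

import heapq, itertools

count = 3
buckets = [
    [(0.9, 0, 'aa'), (0.2, 1, 'bb')],
    [(0.8, 2, 'cc'), (0.95, 3, 'dd')],
    [(0.1, 4, 'ee')],
]

features = []
for feats in buckets:
    # Keep only the `count` largest of everything seen so far
    features = heapq.nlargest(count, itertools.chain(features, feats))

print([f for c, r, f in sorted(features, reverse=True)])  # ['dd', 'aa', 'cc']
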
Example #11
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
  """
  @param items a list of (label, path) pairs
  """
  global outdir

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    tasks = args.jobs * 2
  else:
    tasks = mp.cpu_count() * 2

  # Ensure chunksize of at least 1, but not exceeding specified chunksize
  chunksize = max(1, min(len(items) / tasks, args.chunksize))

  outdir = tempfile.mkdtemp(prefix="NBtrain-",suffix='-buckets', dir=temp_path)
  b_dirs = [ os.path.join(outdir,"bucket{0}".format(i)) for i in range(args.buckets) ]

  for d in b_dirs:
    os.mkdir(d)

  output_states = set(tk_output)
  
  # Divide all the items to be processed into chunks, and enumerate each chunk.
  item_chunks = list(chunk(items, chunksize))
  pass_tokenize_arg = enumerate(item_chunks)
  
  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs, args.line) 
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

  write_count = 0
  chunk_sizes = {}
  labels = []
  for chunk_id, doc_count, writes, _labels in pass_tokenize_out:
    write_count += writes
    chunk_sizes[chunk_id] = doc_count
    labels.extend(_labels)

  print "wrote a total of %d keys" % write_count

  num_instances = sum(chunk_sizes.values())
  print "processed a total of %d instances" % num_instances

  chunk_offsets = {}
  for i in range(len(chunk_sizes)):
    chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))
    print "  offset for chunk {0} is {1}".format(i, chunk_offsets[i])

  pass_fm_params = (num_instances, chunk_offsets)
  with MapPool(args.jobs, setup_pass_fm, pass_fm_params) as f:
    pass_fm_out = f(pass_fm, b_dirs)

  reads, ids, fms = zip(*pass_fm_out)
  read_count = sum(reads)
  print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  num_features = max( i for v in tk_output.values() for i in v) + 1
  fm = np.zeros((num_features, num_instances), dtype=int)
  fm[np.concatenate(ids)] = np.vstack(fms)

  print "have {} labels".format(len(labels))
  cm = np.zeros((num_instances, num_langs), dtype='bool')
  for doc_id, lang_id in enumerate(labels):
    cm[doc_id, lang_id] = True

  # This is where the smoothing occurs
  prod = np.dot(fm, cm)
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)

  pc = np.log(cm.sum(0))
  nb_pc = array.array('d', pc)

  return nb_pc, nb_ptc
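
The class prior at the end of `learn_nb_params` comes straight from the boolean class matrix: `cm.sum(0)` counts documents per language and `np.log` turns those counts into (unnormalised) log-priors. A tiny standalone sketch of that step with made-up labels:

import numpy as np

num_langs = 3
labels = [0, 2, 2, 1, 2]  # lang_id for each document, in document order

cm = np.zeros((len(labels), num_langs), dtype='bool')
for doc_id, lang_id in enumerate(labels):
    cm[doc_id, lang_id] = True

print(cm.sum(0))          # per-language document counts: [1 1 3]
pc = np.log(cm.sum(0))    # unnormalised log-prior, as in the code above
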
Example #12
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
    """
  @param items a list of (label, path) pairs
  """
    global outdir

    print "learning NB parameters on {} items".format(len(items))

    # Generate the feature map
    nm_arr = mp.Array('i', tk_nextmove, lock=False)

    if args.jobs:
        tasks = args.jobs * 2
    else:
        tasks = mp.cpu_count() * 2

    # Ensure chunksize of at least 1, but not exceeding specified chunksize
    chunksize = max(1, min(len(items) / tasks, args.chunksize))

    outdir = tempfile.mkdtemp(prefix="NBtrain-",
                              suffix='-buckets',
                              dir=temp_path)
    b_dirs = [
        os.path.join(outdir, "bucket{0}".format(i))
        for i in range(args.buckets)
    ]

    for d in b_dirs:
        os.mkdir(d)

    output_states = set(tk_output)

    # Divide all the items to be processed into chunks, and enumerate each chunk.
    item_chunks = list(chunk(items, chunksize))
    num_chunks = len(item_chunks)
    print "about to tokenize {} chunks".format(num_chunks)

    pass_tokenize_arg = enumerate(item_chunks)
    pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs,
                            args.line)
    with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
        pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

        write_count = 0
        chunk_sizes = {}
        chunk_labels = []
        for i, (chunk_id, doc_count, writes,
                labels) in enumerate(pass_tokenize_out):
            write_count += writes
            chunk_sizes[chunk_id] = doc_count
            chunk_labels.append((chunk_id, labels))
            print "processed chunk ID:{0} ({1}/{2}) [{3} keys]".format(
                chunk_id, i + 1, num_chunks, writes)

    print "wrote a total of %d keys" % write_count

    num_instances = sum(chunk_sizes.values())
    print "processed a total of %d instances" % num_instances

    chunk_offsets = {}
    for i in range(len(chunk_sizes)):
        chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))

    # Build the class matrix (cm), re-ordering documents by chunk
    cm = np.zeros((num_instances, num_langs), dtype='bool')
    for chunk_id, chunk_label in chunk_labels:
        for doc_id, lang_id in enumerate(chunk_label):
            index = doc_id + chunk_offsets[chunk_id]
            cm[index, lang_id] = True

    pass_ptc_params = (cm, num_instances, chunk_offsets)
    with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
        pass_ptc_out = f(pass_ptc, b_dirs)

        def pass_ptc_progress():
            for i, v in enumerate(pass_ptc_out):
                yield v
                print "processed chunk ({0}/{1})".format(i + 1, len(b_dirs))

        reads, ids, prods = zip(*pass_ptc_progress())
        read_count = sum(reads)
        print "read a total of %d keys (%d short)" % (read_count,
                                                      write_count - read_count)

    num_features = max(i for v in tk_output.values() for i in v) + 1
    prod = np.zeros((num_features, cm.shape[1]), dtype=int)
    prod[np.concatenate(ids)] = np.vstack(prods)

    # This is where the smoothing occurs
    ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

    nb_ptc = array.array('d')
    for term_dist in ptc.tolist():
        nb_ptc.extend(term_dist)

    pc = np.log(cm.sum(0))
    nb_pc = array.array('d', pc)

    return nb_pc, nb_ptc
Example #13
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE,
                sample_count=None,
                sample_size=None):
    """
  @param items a list of (domain, language, path) tuples
  """
    global b_dirs, complete

    # Our exitfunc uses this to know whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        tempfile.mkdtemp(prefix="tokenize-",
                         suffix='-{0}'.format(tokenizer.__class__.__name__),
                         dir=outdir) for i in range(buckets)
    ]

    # PASS 1: Tokenize documents into sets of terms

    # If there are few items, make the chunk size such that each job
    # will have 2 chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)

        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        logger.info("chunk size: {0} ({1} chunks)".format(
            chunk_size, chunk_count))
        logger.info("job count: {0}".format(jobs))

        if sample_count:
            logger.info(
                "sampling-based tokenization: size {0} count {1}".format(
                    sample_size, sample_count))
        else:
            logger.info("whole-document tokenization")

        total_bytes = 0
        for i, chunk_bytes in enumerate(pass_tokenize_out):
            logger.debug("tokenized chunk (%d/%d) [%d bytes]" %
                         (i + 1, chunk_count, chunk_bytes))
            total_bytes += chunk_bytes

    logger.info("tokenized a total of {0} MB".format(total_bytes / 1024 /
                                                     1024))

    complete = True

    return b_dirs
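
The chunk-size rule `max(1, min(len(items) / (jobs * 2), chunksize))` caps chunks at the configured CHUNKSIZE but shrinks them when there are few items, so each worker ends up with roughly two chunks; the `max(1, ...)` guard stops it collapsing to zero. A quick numeric check (written with `//` so the floor division matches the Python 2 behaviour of the code above):

def pick_chunk_size(num_items, jobs, chunksize):
    # Same rule as build_index above, with explicit floor division.
    return max(1, min(num_items // (jobs * 2), chunksize))

print(pick_chunk_size(1000000, 8, 50))  # plenty of items: capped at 50
print(pick_chunk_size(100, 8, 50))      # few items: 100 // 16 = 6
print(pick_chunk_size(5, 8, 50))        # very few items: raised to 1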