Example #1
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE,
                sample_count=None,
                sample_size=None,
                term_freq=False,
                line_level=False):
    """
  @param items a list of (domain, language, path) tuples
  """
    global b_dirs, complete

    # Our exitfunc uses this to know whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        os.path.join(outdir, "bucket{0}".format(i)) for i in range(buckets)
    ]

    for d in b_dirs:
        os.mkdir(d)

    # PASS 1: Tokenize documents into sets of terms

    # If there are few items, make the chunk size such that each job
    # will have 2 chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size,
                             term_freq, line_level)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)

        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
        print "job count: {0}".format(jobs)

        if sample_count:
            print "sampling-based tokenization: size {0} count {1}".format(
                sample_size, sample_count)
        else:
            print "whole-document tokenization"

        for i, keycount in enumerate(pass_tokenize_out):
            print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count,
                                                         keycount)

    complete = True

    return b_dirs
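
The chunk-size heuristic above ("each job will have 2 chunks") can be checked by hand. Below is a small stand-alone sketch with assumed values (items_len, jobs and chunksize are placeholders, not taken from the examples); '//' is used to make the Python 2 integer division explicit.

# Worked example of chunk_size = max(1, min(len(items) / (jobs * 2), chunksize)),
# with assumed values; '//' mirrors the Python 2 integer division used above.
items_len, jobs, chunksize = 1000, 8, 50
chunk_size = max(1, min(items_len // (jobs * 2), chunksize))  # min(62, 50) -> 50
print(chunk_size)  # 50, i.e. 20 chunks of 50 items

items_len = 10
chunk_size = max(1, min(items_len // (jobs * 2), chunksize))  # min(0, 50) clamped to 1
print(chunk_size)  # 1, i.e. one item per chunk for a small collection
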
Example #2
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE,
                sample_count=None,
                sample_size=None,
                term_freq=False,
                line_level=False):
    global b_dirs, complete

    # Used to decide whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        os.path.join(outdir, "bucket{0}".format(i)) for i in range(buckets)
    ]

    for d in b_dirs:
        os.mkdir(d)

    # PASS 1: Split the documents into chunks and tokenize them
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size,
                             term_freq, line_level)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)
        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
        print "job count: {0}".format(jobs)
        if sample_count:
            print "sampling-based tokenization: size {0} count {1}".format(
                sample_size, sample_count)
        else:
            print "whole-document tokenization"

        for i, keycount in enumerate(pass_tokenize_out):
            print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count,
                                                         keycount)

    complete = True

    return b_dirs
Example #3
def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
    global b_dirs
    num_instances = len(paths)
    num_features = max(i for v in tk_output.values() for i in v) + 1

    # Generate the feature map
    nm_arr = mp.Array('i', tk_nextmove, lock=False)

    if args.jobs:
        chunksize = min(len(paths) / (args.jobs * 2), args.chunksize)
    else:
        chunksize = min(len(paths) / (mp.cpu_count() * 2), args.chunksize)

    # TODO: Set the output dir
    b_dirs = [
        tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path)
        for i in range(args.buckets)
    ]

    output_states = set(tk_output)

    path_chunks = list(chunk(paths, chunksize))
    pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)

    pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs)
    with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
        pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

    write_count = sum(pass_tokenize_out)
    if not SILENT:
        print "wrote a total of %d keys" % write_count

    pass_ptc_params = (cm, num_instances)
    with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
        pass_ptc_out = f(pass_ptc, b_dirs)

    reads, ids, prods = zip(*pass_ptc_out)
    read_count = sum(reads)
    if not SILENT:
        print "read a total of %d keys (%d short)" % (read_count,
                                                      write_count - read_count)

    prod = np.zeros((num_features, cm.shape[1]), dtype=int)
    prod[np.concatenate(ids)] = np.vstack(prods)
    ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

    nb_ptc = array.array('d')
    for term_dist in ptc.tolist():
        nb_ptc.extend(term_dist)
    return nb_ptc
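
The line ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0)) appears to be add-one (Laplace) smoothing of the per-language term counts, i.e. log((count + 1) / (class_total + vocabulary_size)). A minimal self-contained sketch with made-up counts (not data from the source) that checks this equivalence:

import numpy as np

# Assumed term-by-language counts, shape (num_features, num_langs).
counts = np.array([[3, 0],
                   [1, 5],
                   [0, 2]])
num_features = counts.shape[0]

log_ptc = np.log(1 + counts) - np.log(num_features + counts.sum(0))
assert np.allclose(log_ptc,
                   np.log((counts + 1.0) / (counts.sum(0) + num_features)))
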
Example #4
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunksize=CHUNKSIZE, sample_count=None, sample_size=None, term_freq=False):
  """
  @param items a list of (domain, language, path) tuples
  """
  global b_dirs, complete

  # Our exitfunc uses this to know whether to delete the tokenized files
  complete = False 

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ os.path.join(outdir,"bucket{0}".format(i)) for i in range(buckets) ]

  for d in b_dirs:
    os.mkdir(d)

  # PASS 1: Tokenize documents into sets of terms
   
  # If there are few items, make the chunk size such that each job
  # will have 2 chunks
  chunk_size = max(1,min(len(items) / (jobs * 2), chunksize))
  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size, term_freq)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)


    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    if not SILENT:
      print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
      print "job count: {0}".format(jobs)

    if sample_count:
      if not SILENT:
        print "sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count)
    else:
      if not SILENT:
        print "whole-document tokenization"

    for i, keycount in enumerate(pass_tokenize_out):
      if not SILENT:
        print "tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount)

  complete = True

  return b_dirs
Example #5
def learn_ptc(paths, tk_nextmove, tk_output, cm, temp_path, args):
  global b_dirs
  num_instances = len(paths)
  num_features = max( i for v in tk_output.values() for i in v) + 1

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    chunksize = min(len(paths) / (args.jobs*2), args.chunksize)
  else:
    chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize)

  # TODO: Set the output dir
  b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ]

  output_states = set(tk_output)
  
  path_chunks = list(chunk(paths, chunksize))
  pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)
  
  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs) 
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

  write_count = sum(pass_tokenize_out)
  if not SILENT:
    print "wrote a total of %d keys" % write_count

  pass_ptc_params = (cm, num_instances)
  with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
    pass_ptc_out = f(pass_ptc, b_dirs)

  reads, ids, prods = zip(*pass_ptc_out)
  read_count = sum(reads)
  if not SILENT:
    print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  prod = np.zeros((num_features, cm.shape[1]), dtype=int)
  prod[np.concatenate(ids)] = np.vstack(prods)
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)
  return nb_ptc
Example #6
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE):
    """
  @param items a list of (domain, language, path) tuples
  """
    global b_dirs, complete

    # Our exitfunc uses this to know whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        tempfile.mkdtemp(prefix="tokenize-",
                         suffix='-{0}'.format(tokenizer.__class__.__name__),
                         dir=outdir) for i in range(buckets)
    ]

    # PASS 1: Tokenize documents into sets of terms

    # If there are few items, make the chunk size such that each job
    # will have 2 chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)

        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
        print "job count: {0}".format(jobs)

        for i, keycount in enumerate(pass_tokenize_out):
            print "tokenized chunk (%d/%d) [%d keys]" % (i + 1, chunk_count,
                                                         keycount)

    complete = True

    return b_dirs
Example #7
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunksize=CHUNKSIZE, sample_count=None, sample_size=None):
  """
  @param items a list of (domain, language, path) tuples
  """
  global b_dirs, complete

  # Our exitfunc uses this to know whether to delete the tokenized files
  complete = False 

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ tempfile.mkdtemp(prefix="tokenize-",suffix='-{0}'.format(tokenizer.__class__.__name__), dir=outdir) for i in range(buckets) ]

  # PASS 1: Tokenize documents into sets of terms
   
  # If there are few items, make the chunk size such that each job
  # will have 2 chunks
  chunk_size = max(1,min(len(items) / (jobs * 2), chunksize))
  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)


    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    logger.info("chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count))
    logger.info("job count: {0}".format(jobs))

    if sample_count:
      logger.info("sampling-based tokenization: size {0} count {1}".format(sample_size, sample_count))
    else:
      logger.info("whole-document tokenization")

    total_bytes = 0
    for i, chunk_bytes in enumerate(pass_tokenize_out):
      logger.debug("tokenized chunk (%d/%d) [%d bytes]" % (i+1,chunk_count, chunk_bytes))
      total_bytes += chunk_bytes

  logger.info("tokenized a total of {0} MB".format(total_bytes / 1024 / 1024))

  complete = True

  return b_dirs
Example #8
def learn_ftc(paths, tk_nextmove, tk_output, cm, temp_path, args):
    global b_dirs
    num_instances = len(paths)
    num_features = max(i for v in tk_output.values() for i in v) + 1

    # Generate the feature map
    nm_arr = mp.Array('i', tk_nextmove, lock=False)

    if args.jobs:
        chunksize = min(len(paths) / (args.jobs * 2), args.chunksize)
    else:
        chunksize = min(len(paths) / (mp.cpu_count() * 2), args.chunksize)

    # TODO: Set the output dir
    b_dirs = [
        tempfile.mkdtemp(prefix="train-", suffix='-bucket', dir=temp_path)
        for i in range(args.buckets)
    ]

    output_states = set(tk_output)

    path_chunks = list(chunk(paths, chunksize))
    pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)

    pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs)
    with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
        pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

    write_count = sum(pass_tokenize_out)
    logger.info("wrote a total of %d keys", write_count)

    # TODO: Report on the progress of this pass
    pass_ftc_params = (cm, num_instances)
    with MapPool(args.jobs, setup_pass_ftc, pass_ftc_params) as f:
        pass_ftc_out = f(pass_ftc, b_dirs)

    reads, ids, prods = zip(*pass_ftc_out)
    read_count = sum(reads)
    logger.info("read a total of %d keys (%d short)", read_count,
                write_count - read_count)

    # Re-order the weights into a single ndarray
    term_lang_counts = np.zeros((num_features, cm.shape[1]), dtype=int)
    term_lang_counts[np.concatenate(ids)] = np.vstack(prods)
    return term_lang_counts
Example #9
def learn_ftc(paths, tk_nextmove, tk_output, cm, temp_path, args):
  global b_dirs
  num_instances = len(paths)
  num_features = max( i for v in tk_output.values() for i in v) + 1

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    chunksize = min(len(paths) / (args.jobs*2), args.chunksize)
  else:
    chunksize = min(len(paths) / (mp.cpu_count()*2), args.chunksize)

  # TODO: Set the output dir
  b_dirs = [ tempfile.mkdtemp(prefix="train-",suffix='-bucket', dir=temp_path) for i in range(args.buckets) ]

  output_states = set(tk_output)
  
  path_chunks = list(chunk(paths, chunksize))
  pass_tokenize_arg = zip(offsets(path_chunks), path_chunks)
  
  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs) 
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

  write_count = sum(pass_tokenize_out)
  logger.info("wrote a total of %d keys", write_count)

  # TODO: Report on the progress of this pass
  pass_ftc_params = (cm, num_instances)
  with MapPool(args.jobs, setup_pass_ftc, pass_ftc_params) as f:
    pass_ftc_out = f(pass_ftc, b_dirs)

  reads, ids, prods = zip(*pass_ftc_out)
  read_count = sum(reads)
  logger.info("read a total of %d keys (%d short)", read_count, write_count - read_count)

  # Re-order the weights into a single ndarray
  term_lang_counts = np.zeros((num_features, cm.shape[1]), dtype=int)
  term_lang_counts[np.concatenate(ids)] = np.vstack(prods)
  return term_lang_counts
Example #10
def build_index(items, tokenizer, outdir, buckets=NUM_BUCKETS, jobs=None, chunksize=CHUNKSIZE):
  """
  @param items a list of (domain, language, path) tuples
  """
  global b_dirs, complete

  # Our exitfunc uses this to know whether to delete the tokenized files
  complete = False 

  if jobs is None:
    jobs = mp.cpu_count() + 4

  b_dirs = [ tempfile.mkdtemp(prefix="tokenize-",suffix='-{0}'.format(tokenizer.__class__.__name__), dir=outdir) for i in range(buckets) ]

  # PASS 1: Tokenize documents into sets of terms
   
  # If there are few items, make the chunk size such that each job
  # will have 2 chunks
  chunk_size = max(1,min(len(items) / (jobs * 2), chunksize))
  item_chunks = list(chunk(items, chunk_size))
  pass_tokenize_globals = (tokenizer, b_dirs)

  with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
    pass_tokenize_out = f(pass_tokenize, item_chunks)


    doc_count = defaultdict(int)
    chunk_count = len(item_chunks)
    print "chunk size: {0} ({1} chunks)".format(chunk_size, chunk_count)
    print "job count: {0}".format(jobs)

    for i, keycount in enumerate(pass_tokenize_out):
      print "tokenized chunk (%d/%d) [%d keys]" % (i+1,chunk_count, keycount)

  complete = True

  return b_dirs
Example #11
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
  """
  @param items label, path pairs
  """
  global outdir

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    tasks = args.jobs * 2
  else:
    tasks = mp.cpu_count() * 2

  # Ensure chunksize of at least 1, but not exceeding specified chunksize
  chunksize = max(1, min(len(items) / tasks, args.chunksize))

  outdir = tempfile.mkdtemp(prefix="NBtrain-",suffix='-buckets', dir=temp_path)
  b_dirs = [ os.path.join(outdir,"bucket{0}".format(i)) for i in range(args.buckets) ]

  for d in b_dirs:
    os.mkdir(d)

  output_states = set(tk_output)
  
  # Divide all the items to be processed into chunks, and enumerate each chunk.
  item_chunks = list(chunk(items, chunksize))
  pass_tokenize_arg = enumerate(item_chunks)
  
  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs, args.line) 
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

  write_count = 0
  chunk_sizes = {}
  labels = []
  for chunk_id, doc_count, writes, _labels in pass_tokenize_out:
    write_count += writes
    chunk_sizes[chunk_id] = doc_count
    labels.extend(_labels)

  print "wrote a total of %d keys" % write_count

  num_instances = sum(chunk_sizes.values())
  print "processed a total of %d instances" % num_instances

  chunk_offsets = {}
  for i in range(len(chunk_sizes)):
    chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))
    print "  offset for chunk {0} is {1}".format(i, chunk_offsets[i])

  pass_fm_params = (num_instances, chunk_offsets)
  with MapPool(args.jobs, setup_pass_fm, pass_fm_params) as f:
    pass_fm_out = f(pass_fm, b_dirs)

  reads, ids, fms = zip(*pass_fm_out)
  read_count = sum(reads)
  print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  num_features = max( i for v in tk_output.values() for i in v) + 1
  fm = np.zeros((num_features, num_instances), dtype=int)
  fm[np.concatenate(ids)] = np.vstack(fms)

  print "have {} labels".format(len(labels))
  cm = np.zeros((num_instances, num_langs), dtype='bool')
  for doc_id, lang_id in enumerate(labels):
    cm[doc_id, lang_id] = True

  # This is where the smoothing occurs
  prod = np.dot(fm, cm)
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)

  pc = np.log(cm.sum(0))
  nb_pc = array.array('d', pc)

  return nb_pc, nb_ptc
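
In the example above, cm is a boolean document-by-language matrix, so cm.sum(0) counts the documents per language and pc = np.log(cm.sum(0)) is an unnormalized log prior. A tiny sketch with dummy labels (not taken from the source) that illustrates just this step:

import numpy as np

labels = [0, 0, 1, 0]  # assumed language ids for 4 documents
cm = np.zeros((len(labels), 2), dtype='bool')
for doc_id, lang_id in enumerate(labels):
    cm[doc_id, lang_id] = True

pc = np.log(cm.sum(0))  # log([3, 1]) -- unnormalized log priors
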
Example #12
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
    """
  @param items label, path pairs
  """
    global outdir

    print "learning NB parameters on {} items".format(len(items))

    # Generate the feature map
    nm_arr = mp.Array('i', tk_nextmove, lock=False)

    if args.jobs:
        tasks = args.jobs * 2
    else:
        tasks = mp.cpu_count() * 2

    # Ensure chunksize of at least 1, but not exceeding specified chunksize
    chunksize = max(1, min(len(items) / tasks, args.chunksize))

    outdir = tempfile.mkdtemp(prefix="NBtrain-",
                              suffix='-buckets',
                              dir=temp_path)
    b_dirs = [
        os.path.join(outdir, "bucket{0}".format(i))
        for i in range(args.buckets)
    ]

    for d in b_dirs:
        os.mkdir(d)

    output_states = set(tk_output)

    # Divide all the items to be processed into chunks, and enumerate each chunk.
    item_chunks = list(chunk(items, chunksize))
    num_chunks = len(item_chunks)
    print "about to tokenize {} chunks".format(num_chunks)

    pass_tokenize_arg = enumerate(item_chunks)
    pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs,
                            args.line)
    with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
        pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)

        write_count = 0
        chunk_sizes = {}
        chunk_labels = []
        for i, (chunk_id, doc_count, writes,
                labels) in enumerate(pass_tokenize_out):
            write_count += writes
            chunk_sizes[chunk_id] = doc_count
            chunk_labels.append((chunk_id, labels))
            print "processed chunk ID:{0} ({1}/{2}) [{3} keys]".format(
                chunk_id, i + 1, num_chunks, writes)

    print "wrote a total of %d keys" % write_count

    num_instances = sum(chunk_sizes.values())
    print "processed a total of %d instances" % num_instances

    chunk_offsets = {}
    for i in range(len(chunk_sizes)):
        chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))

    # Build cm by re-ordering the per-chunk labels
    cm = np.zeros((num_instances, num_langs), dtype='bool')
    for chunk_id, chunk_label in chunk_labels:
        for doc_id, lang_id in enumerate(chunk_label):
            index = doc_id + chunk_offsets[chunk_id]
            cm[index, lang_id] = True

    pass_ptc_params = (cm, num_instances, chunk_offsets)
    with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
        pass_ptc_out = f(pass_ptc, b_dirs)

        def pass_ptc_progress():
            for i, v in enumerate(pass_ptc_out):
                yield v
                print "processed chunk ({0}/{1})".format(i + 1, len(b_dirs))

        reads, ids, prods = zip(*pass_ptc_progress())
        read_count = sum(reads)
        print "read a total of %d keys (%d short)" % (read_count,
                                                      write_count - read_count)

    num_features = max(i for v in tk_output.values() for i in v) + 1
    prod = np.zeros((num_features, cm.shape[1]), dtype=int)
    prod[np.concatenate(ids)] = np.vstack(prods)

    # This is where the smoothing occurs
    ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

    nb_ptc = array.array('d')
    for term_dist in ptc.tolist():
        nb_ptc.extend(term_dist)

    pc = np.log(cm.sum(0))
    nb_pc = array.array('d', pc)

    return nb_pc, nb_ptc
Example #13
def build_index(items,
                tokenizer,
                outdir,
                buckets=NUM_BUCKETS,
                jobs=None,
                chunksize=CHUNKSIZE,
                sample_count=None,
                sample_size=None):
    """
  @param items a list of (domain, language, path) tuples
  """
    global b_dirs, complete

    # Our exitfunc uses this to know whether to delete the tokenized files
    complete = False

    if jobs is None:
        jobs = mp.cpu_count() + 4

    b_dirs = [
        tempfile.mkdtemp(prefix="tokenize-",
                         suffix='-{0}'.format(tokenizer.__class__.__name__),
                         dir=outdir) for i in range(buckets)
    ]

    # PASS 1: Tokenize documents into sets of terms

    # If there are few items, make the chunk size such that each job
    # will have 2 chunks
    chunk_size = max(1, min(len(items) / (jobs * 2), chunksize))
    item_chunks = list(chunk(items, chunk_size))
    pass_tokenize_globals = (tokenizer, b_dirs, sample_count, sample_size)

    with MapPool(jobs, setup_pass_tokenize, pass_tokenize_globals) as f:
        pass_tokenize_out = f(pass_tokenize, item_chunks)

        doc_count = defaultdict(int)
        chunk_count = len(item_chunks)
        logger.info("chunk size: {0} ({1} chunks)".format(
            chunk_size, chunk_count))
        logger.info("job count: {0}".format(jobs))

        if sample_count:
            logger.info(
                "sampling-based tokenization: size {0} count {1}".format(
                    sample_size, sample_count))
        else:
            logger.info("whole-document tokenization")

        total_bytes = 0
        for i, chunk_bytes in enumerate(pass_tokenize_out):
            logger.debug("tokenized chunk (%d/%d) [%d bytes]" %
                         (i + 1, chunk_count, chunk_bytes))
            total_bytes += chunk_bytes

    logger.info("tokenized a total of {0} MB".format(total_bytes / 1024 /
                                                     1024))

    complete = True

    return b_dirs
Example #14
def learn_nb_params(items, num_langs, tk_nextmove, tk_output, temp_path, args):
  """
  @param items label, path pairs
  """
  global outdir

  print "learning NB parameters on {} items".format(len(items))

  # Generate the feature map
  nm_arr = mp.Array('i', tk_nextmove, lock=False)

  if args.jobs:
    tasks = args.jobs * 2
  else:
    tasks = mp.cpu_count() * 2

  # Ensure chunksize of at least 1, but not exceeding specified chunksize
  chunksize = max(1, min(len(items) / tasks, args.chunksize))

  outdir = tempfile.mkdtemp(prefix="NBtrain-",suffix='-buckets', dir=temp_path)
  b_dirs = [ os.path.join(outdir,"bucket{0}".format(i)) for i in range(args.buckets) ]

  for d in b_dirs:
    os.mkdir(d)

  output_states = set(tk_output)
  
  # Divide all the items to be processed into chunks, and enumerate each chunk.
  item_chunks = list(chunk(items, chunksize))
  num_chunks = len(item_chunks)
  print "about to tokenize {} chunks".format(num_chunks)
  
  pass_tokenize_arg = enumerate(item_chunks)
  pass_tokenize_params = (nm_arr, output_states, tk_output, b_dirs, args.line) 
  with MapPool(args.jobs, setup_pass_tokenize, pass_tokenize_params) as f:
    pass_tokenize_out = f(pass_tokenize, pass_tokenize_arg)
  
    write_count = 0
    chunk_sizes = {}
    chunk_labels = []
    for i, (chunk_id, doc_count, writes, labels) in enumerate(pass_tokenize_out):
      write_count += writes
      chunk_sizes[chunk_id] = doc_count
      chunk_labels.append((chunk_id, labels))
      print "processed chunk ID:{0} ({1}/{2}) [{3} keys]".format(chunk_id, i+1, num_chunks, writes)

  print "wrote a total of %d keys" % write_count

  num_instances = sum(chunk_sizes.values())
  print "processed a total of %d instances" % num_instances

  chunk_offsets = {}
  for i in range(len(chunk_sizes)):
    chunk_offsets[i] = sum(chunk_sizes[x] for x in range(i))

  # Build cm by re-ordering the per-chunk labels
  cm = np.zeros((num_instances, num_langs), dtype='bool')
  for chunk_id, chunk_label in chunk_labels:
    for doc_id, lang_id in enumerate(chunk_label):
      index = doc_id + chunk_offsets[chunk_id]
      cm[index, lang_id] = True

  pass_ptc_params = (cm, num_instances, chunk_offsets)
  with MapPool(args.jobs, setup_pass_ptc, pass_ptc_params) as f:
    pass_ptc_out = f(pass_ptc, b_dirs)

    def pass_ptc_progress():
      for i,v in enumerate(pass_ptc_out):
        yield v
        print "processed chunk ({0}/{1})".format(i+1, len(b_dirs))

    reads, ids, prods = zip(*pass_ptc_progress())
    read_count = sum(reads)
    print "read a total of %d keys (%d short)" % (read_count, write_count - read_count)

  num_features = max( i for v in tk_output.values() for i in v) + 1
  prod = np.zeros((num_features, cm.shape[1]), dtype=int)
  prod[np.concatenate(ids)] = np.vstack(prods)

  # This is where the smoothing occurs
  ptc = np.log(1 + prod) - np.log(num_features + prod.sum(0))

  nb_ptc = array.array('d')
  for term_dist in ptc.tolist():
    nb_ptc.extend(term_dist)

  pc = np.log(cm.sum(0))
  nb_pc = array.array('d', pc)

  return nb_pc, nb_ptc
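
The learn_nb_params variants return nb_ptc as a flat array.array laid out row-major over (num_features, num_langs), alongside nb_pc. A minimal sketch (dummy sizes and values, purely illustrative) of turning those flat arrays back into matrices for inspection:

import array
import numpy as np

num_features, num_langs = 3, 2  # assumed sizes
nb_ptc = array.array('d', range(num_features * num_langs))  # stand-in for the returned values
nb_pc = array.array('d', [1.1, 0.7])

ptc = np.array(nb_ptc).reshape(num_features, num_langs)  # per-term log P(term | lang)
pc = np.array(nb_pc)                                     # unnormalized log P(lang)
print(ptc.shape, pc.shape)  # (3, 2) (2,)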