Example #1
    # print("running here 0")
    #给bucket构造路径
    buckets_dir = os.path.join(model_dir, 'buckets')
    makedir(buckets_dir)
    print "建成完毕."
    # Compute the features
    # Tokenize
    DFfeats = None
    print "will tokenize %d documents" % len(items)
    print "using byte NGram tokenizer, max_order: {0}".format(4)
    tk = NGramTokenizer(1, 4)

    # First-pass tokenization, used to determine DF of features
    tk_dir = os.path.join(buckets_dir, 'tokenize-pass1')
    makedir(tk_dir)
    b_dirs = build_index(items, tk, tk_dir, 64, None, 50, None, 140, False)
    print("running here 0")
    doc_count = tally(b_dirs, None)
    DFfeats = ngram_select(doc_count, 4, 15000)
    shutil.rmtree(tk_dir)

    # Second pass: count only the selected features
    DF_scanner = Scanner(DFfeats)
    df_dir = os.path.join(buckets_dir, 'tokenize-pass2')
    makedir(df_dir)
    b_dirs = build_index(items, DF_scanner, df_dir, 64, None, 50)
    b_dirs = [[d] for d in b_dirs]

    # Compute the vector values
    domain_dist_vec = numpy.array([
        domain_dist[domain_index[d]]
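
The first pass above tallies per-feature document counts over the buckets and then keeps a bounded set of byte n-grams with ngram_select(doc_count, 4, 15000). As a rough illustration of that selection step, here is a minimal sketch, assuming doc_count maps byte n-gram features to their document frequencies; select_by_df and its per-order cap are hypothetical stand-ins, not the actual ngram_select implementation:

from collections import defaultdict

def select_by_df(doc_count, max_order, per_order):
    # Group features by n-gram order (byte length) and keep the
    # highest-DF features within each order, up to per_order of them.
    by_order = defaultdict(list)
    for feat, df in doc_count.items():
        if len(feat) <= max_order:
            by_order[len(feat)].append((df, feat))
    selected = set()
    for feats in by_order.values():
        feats.sort(reverse=True)
        selected.update(f for _df, f in feats[:per_order])
    return selected

# e.g. DFfeats = select_by_df(doc_count, 4, 15000)
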
Example #2
        if args.df_feats:
            print "reading custom features from:", args.df_feats
            DFfeats = read_features(args.df_feats)
            print "building tokenizer for custom list of {0} features".format(
                len(DFfeats))
            tk = Scanner(DFfeats)
        elif args.word:
            print "using word tokenizer"
            tk = str.split
        else:
            print "using byte NGram tokenizer, max_order: {0}".format(
                args.max_order)
            tk = NGramTokenizer(1, args.max_order)

        # First-pass tokenization, used to determine DF of features
        b_dirs = build_index(items, tk, buckets_dir, args.buckets, args.jobs,
                             args.chunksize)

        if args.debug:
            # output the paths to the buckets
            with open(bucketlist_path, 'w') as f:
                for d in b_dirs:
                    f.write(d + '\n')

        # We need to compute a tally if we are selecting features by DF, but also if
        # we want full debug output.
        if DFfeats is None or args.debug:
            # Compute DF per-feature
            doc_count = tally(b_dirs, args.jobs)
            if args.debug:
                doc_count_path = os.path.join(model_dir, 'DF_all')
                write_weights(doc_count, doc_count_path)
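
The snippet above reads its settings from an argparse namespace (args.df_feats, args.word, args.max_order, args.buckets, args.jobs, args.chunksize, args.debug). A minimal sketch of how such flags could be declared follows; the option names mirror what the snippet accesses, but the defaults and help text are assumptions rather than the training script's actual definitions:

import argparse

parser = argparse.ArgumentParser(description="first-pass DF feature selection")
parser.add_argument("--df_feats", metavar="PATH",
                    help="read a custom feature list instead of selecting by DF")
parser.add_argument("--word", action="store_true",
                    help="tokenize on whitespace instead of byte n-grams")
parser.add_argument("--max_order", type=int, default=4,
                    help="maximum byte n-gram length")
parser.add_argument("--buckets", type=int, default=64,
                    help="number of bucket directories to spread tokens over")
parser.add_argument("--jobs", type=int, default=None,
                    help="number of worker processes")
parser.add_argument("--chunksize", type=int, default=50,
                    help="documents per work unit")
parser.add_argument("--debug", action="store_true",
                    help="write intermediate outputs such as the bucket list and DF_all")
args = parser.parse_args()
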
Example #3
        # TODO: Custom tokenizer if doing custom first-pass features
        if args.df_feats:
            print "reading custom features from:", args.df_feats
            DFfeats = read_features(args.df_feats)
            print "building tokenizer for custom list of {0} features".format(len(DFfeats))
            tk = Scanner(DFfeats)
        elif args.word:
            print "using word tokenizer"
            tk = str.split
        else:
            print "using byte NGram tokenizer, max_order: {0}".format(args.max_order)
            tk = NGramTokenizer(1, args.max_order)

        # First-pass tokenization, used to determine DF of features
        b_dirs = build_index(
            items, tk, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size
        )

        if args.debug:
            # output the paths to the buckets
            with open(bucketlist_path, "w") as f:
                for d in b_dirs:
                    f.write(d + "\n")

        # We need to compute a tally if we are selecting features by DF, but also if
        # we want full debug output.
        if DFfeats is None or args.debug:
            # Compute DF per-feature
            doc_count = tally(b_dirs, args.jobs)
            if args.debug:
                doc_count_path = os.path.join(model_dir, "DF_all")
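
In each example, tally(b_dirs, args.jobs) aggregates per-feature document counts across the bucket directories produced by build_index. The sketch below shows the general shape of such an aggregation; the counts-file name and its "feature<TAB>count" layout are assumptions for illustration, not the real bucket format, and it runs serially rather than using args.jobs workers:

import os
from collections import Counter

def tally_buckets(bucket_dirs, counts_name="counts"):
    # Sum per-feature document counts found in each bucket directory.
    total = Counter()
    for d in bucket_dirs:
        path = os.path.join(d, counts_name)
        with open(path) as f:
            for line in f:
                feat, count = line.rstrip("\n").rsplit("\t", 1)
                total[feat] += int(count)
    return total
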
Example #4
    if args.df_feats:
      print "reading custom features from:", args.df_feats
      DFfeats = read_features(args.df_feats)
      print "building tokenizer for custom list of {0} features".format(len(DFfeats))
      tk = Scanner(DFfeats)
    elif args.word:
      print "using word tokenizer"
      tk = str.split
    else:
      print "using byte NGram tokenizer, max_order: {0}".format(args.max_order)
      tk = NGramTokenizer(1, args.max_order)
    
    # First-pass tokenization, used to determine DF of features
    tk_dir = os.path.join(buckets_dir, 'tokenize-pass1')
    makedir(tk_dir)
    b_dirs = build_index(items, tk, tk_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size, args.line)

    if args.debug:
      # output the paths to the buckets
      bucketlist_path = os.path.join(model_dir, 'bucketlist')
      with open(bucketlist_path,'w') as f:
        for d in b_dirs:
          f.write(d+'\n')

    # We need to compute a tally if we are selecting features by DF, but also if
    # we want full debug output.
    if DFfeats is None or args.debug:
      # Compute DF per-feature
      doc_count = tally(b_dirs, args.jobs)
      if args.debug:
        doc_count_path = os.path.join(model_dir, 'DF_all')
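
When args.debug is set, the tallied counts are persisted under model_dir as DF_all via write_weights (see Example #2). A minimal sketch of such a dump, assuming a plain "feature<TAB>count" text layout; the real write_weights may use a different serialization:

def write_weights_sketch(weights, path):
    # Write one "feature<TAB>value" line per feature, highest values first.
    with open(path, "w") as f:
        for feat, value in sorted(weights.items(), key=lambda kv: kv[1], reverse=True):
            f.write("{0!r}\t{1}\n".format(feat, value))

# e.g. write_weights_sketch(doc_count, os.path.join(model_dir, 'DF_all'))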