コード例 #1
0
ファイル: train.py プロジェクト: jiuyue99207/lan_id
    # Compute the information-gain (IG) weights.
    # Each entry is (label, distribution vector, file suffix, binarize flag),
    # unpacked in that order by the loop below.
    ig_params = [
        ('lang', lang_dist_vec, '.lang', True),
    ]
    # NOTE(review): looks like a leftover debugging trace -- confirm whether
    # it is still wanted.
    print("come here 111")
    # The domain entry is appended unconditionally in this variant.
    ig_params.append(('domain', domain_dist_vec, '.domain', False))

    # label -> {row[0]: flattened numpy array of row[1]} for every row that
    # compute_IG returns.
    ig_vals = {}
    for label, dist, suffix, binarize in ig_params:
        print "Computing information gain for {0}".format(label)
        ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, None)
        ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)

    # Select features according to the LD criteria.
    # NOTE(review): the literal 300 is presumably the number of features kept
    # per language -- confirm against select_LD_features.
    features_per_lang = select_LD_features(ig_vals['lang'],
                                           ig_vals.get('domain'),
                                           300,
                                           ignore_domain=canshu.no_domain_ig)
    # Merge the values of features_per_lang into a single set.
    LDfeats = reduce(set.union, map(set, features_per_lang.values()))
    print 'selected %d features' % len(LDfeats)

    # Compile the scanner for the LD features.
    tk_nextmove, tk_output = build_scanner(LDfeats)

    # Assemble the NB model.
    # Keys of lang_index, ordered by the value each key maps to.
    langs = sorted(lang_index, key=lang_index.get)

    nb_classes = langs
    nb_dir = os.path.join(buckets_dir, 'NBtrain')
    makedir(nb_dir)
    nb_pc, nb_ptc = learn_nb_params([(int(l), p) for _, l, p in items],
                                    len(langs), tk_nextmove, tk_output, nb_dir,
コード例 #2
0
ファイル: train.py プロジェクト: EricSchles/topic-clustering
      ('lang', lang_dist_vec, '.lang', True),
    ]
    if not args.no_domain_ig:
      ig_params.append( ('domain', domain_dist_vec, '.domain', False) )

    ig_vals = {}
    for label, dist, suffix, binarize in ig_params:
      print "Computing information gain for {0}".format(label)
      ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs)
      if args.debug:
        weights_path = os.path.join(model_dir, 'IGweights' + suffix + ('.bin' if binarize else ''))
        write_weights(ig, weights_path)
      ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)

    # Select features according to the LD criteria
    features_per_lang = select_LD_features(ig_vals['lang'], ig_vals['domain'], args.feats_per_lang, ignore_domain = args.no_domain_ig)
    LDfeats = reduce(set.union, map(set, features_per_lang.values()))
    print 'selected %d features' % len(LDfeats)

    if args.debug:
      feature_path = os.path.join(model_dir, 'LDfeats')
      write_features(sorted(LDfeats), feature_path)
      print 'wrote LD features to "%s"' % feature_path 

      with open(feature_path + '.perlang', 'w') as f:
        writer = csv.writer(f)
        for i in range(len(features_per_lang)):
          writer.writerow(map(repr,features_per_lang[i]))
      print 'wrote LD.perlang features to "%s"' % feature_path + '.perlang'

  # Compile a scanner for the LDfeats
コード例 #3
0
ファイル: train.py プロジェクト: aitzol/langid.py
        ig_params = [("lang", lang_dist_vec, ".lang", True)]
        if not args.no_domain_ig:
            ig_params.append(("domain", domain_dist_vec, ".domain", False))

        ig_vals = {}
        for label, dist, suffix, binarize in ig_params:
            print "Computing information gain for {0}".format(label)
            ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs)
            if args.debug:
                weights_path = os.path.join(model_dir, "IGweights" + suffix + (".bin" if binarize else ""))
                write_weights(ig, weights_path)
            ig_vals[label] = dict((row[0], numpy.array(row[1].flat)) for row in ig)

        # Select features according to the LD criteria
        features_per_lang = select_LD_features(
            ig_vals["lang"], ig_vals.get("domain"), args.feats_per_lang, ignore_domain=args.no_domain_ig
        )
        LDfeats = reduce(set.union, map(set, features_per_lang.values()))
        print "selected %d features" % len(LDfeats)

        if args.debug:
            feature_path = os.path.join(model_dir, "LDfeats")
            write_features(sorted(LDfeats), feature_path)
            print 'wrote LD features to "%s"' % feature_path

            with open(feature_path + ".perlang", "w") as f:
                writer = csv.writer(f)
                for i in range(len(features_per_lang)):
                    writer.writerow(map(repr, features_per_lang[i]))
            print 'wrote LD.perlang features to "%s"' % feature_path + ".perlang"
コード例 #4
0
        ig_vals = {}
        for label, dist, suffix, binarize in ig_params:
            print "Computing information gain for {0}".format(label)
            ig = compute_IG(b_dirs, DFfeats, dist, binarize, suffix, args.jobs)
            if args.debug:
                weights_path = os.path.join(
                    model_dir,
                    'IGweights' + suffix + ('.bin' if binarize else ''))
                write_weights(ig, weights_path)
            ig_vals[label] = dict(
                (row[0], numpy.array(row[1].flat)) for row in ig)

        # Select features according to the LD criteria
        features_per_lang = select_LD_features(ig_vals['lang'],
                                               ig_vals['domain'],
                                               args.feats_per_lang,
                                               ignore_domain=args.no_domain_ig)
        LDfeats = reduce(set.union, map(set, features_per_lang.values()))
        print 'selected %d features' % len(LDfeats)

        if args.debug:
            feature_path = os.path.join(model_dir, 'LDfeats')
            write_features(sorted(LDfeats), feature_path)
            print 'wrote LD features to "%s"' % feature_path

            with open(feature_path + '.perlang', 'w') as f:
                writer = csv.writer(f)
                for i in range(len(features_per_lang)):
                    writer.writerow(map(repr, features_per_lang[i]))
            print 'wrote LD.perlang features to "%s"' % feature_path + '.perlang'