コード例 #1
0
def _run(args):
    """Compute TF-IDF values for the documents listed in ``args.docs``.

    Each line of ``args.docs`` is split on whitespace; the first token is
    used as the document id and the remaining tokens are accumulated into
    the term-frequency stats with ``args.ngram_order``.

    Two modes, selected by ``args.accumulate_over_docs``:

    * False: after each line the document id (followed by a space) and the
      values produced by ``tf_idf.write_tfidf_from_stats`` are written to
      ``args.tf_idf_file``, and the TF stats are reset for the next line.
    * True: stats are accumulated over all lines; afterwards
      ``compute_term_stats`` is called (updating the IDF stats only when no
      input IDF stats were supplied), the IDF stats are optionally written
      to ``args.output_idf_stats`` (which is then closed), and the TF-IDF
      values are written once.

    Raises:
        RuntimeError: if ``args.docs`` yielded no lines.  Note that this
            check runs after any output has already been attempted.
    """
    doc_tf_stats = tf_idf.TFStats()
    idf_stats = tf_idf.IDFStats()

    def _emit(stats, **extra):
        # Single place for the weighting options shared by both modes;
        # the options are looked up on args at call time.
        tf_idf.write_tfidf_from_stats(
            stats,
            idf_stats,
            args.tf_idf_file,
            tf_weighting_scheme=args.tf_weighting_scheme,
            idf_weighting_scheme=args.idf_weighting_scheme,
            tf_normalization_factor=args.tf_normalization_factor,
            **extra
        )

    if args.input_idf_stats is not None:
        idf_stats.read(args.input_idf_stats)

    num_done = 0
    for num_done, line in enumerate(args.docs, start=1):
        tokens = line.strip().split()
        doc_id = tokens[0]
        doc_tf_stats.accumulate(doc_id, tokens[1:], args.ngram_order)

        if args.accumulate_over_docs:
            # Keep accumulating; output happens once after the loop.
            continue

        # Write the document-id and the corresponding tf-idf values.
        print(doc_id, file=args.tf_idf_file, end=" ")
        _emit(doc_tf_stats, expected_document_id=doc_id)
        # Start from fresh stats for the next document.
        doc_tf_stats = tf_idf.TFStats()

    if args.accumulate_over_docs:
        # Only let compute_term_stats touch the IDF stats when they were
        # not read from an input file.
        if args.input_idf_stats is None:
            idf_target = idf_stats
        else:
            idf_target = None
        doc_tf_stats.compute_term_stats(idf_stats=idf_target)

        if args.output_idf_stats is not None:
            idf_stats.write(args.output_idf_stats)
            args.output_idf_stats.close()

        _emit(doc_tf_stats)

    if not num_done:
        raise RuntimeError("Could not compute TF-IDF for any query documents")
コード例 #2
0
ファイル: compute_tf_idf.py プロジェクト: LvHang/kaldi
def _run(args):
    """Read documents from ``args.docs`` and write their TF-IDF values.

    Every input line is whitespace-split into a document id (first token)
    and its terms (remaining tokens), which are accumulated into a
    ``tf_idf.TFStats`` object using ``args.ngram_order``.  If
    ``args.input_idf_stats`` is given, the IDF stats are read from it
    before any document is processed.

    When ``args.accumulate_over_docs`` is false, output is produced per
    line: the document id and a space are printed to ``args.tf_idf_file``,
    ``tf_idf.write_tfidf_from_stats`` is called for that document, and the
    TF stats are replaced by a fresh object.  When it is true, output is
    produced once after all lines were read, preceded by a call to
    ``compute_term_stats`` and, if ``args.output_idf_stats`` is given, by
    writing the IDF stats to that file and closing it.

    Raises:
        RuntimeError: if no input line was processed (checked at the very
            end, after the output steps above).
    """
    tf_stats = tf_idf.TFStats()
    idf_stats = tf_idf.IDFStats()

    def weighting_options():
        # Collected on demand so the values are read from args at the
        # moment of each write call.
        return {
            "tf_weighting_scheme": args.tf_weighting_scheme,
            "idf_weighting_scheme": args.idf_weighting_scheme,
            "tf_normalization_factor": args.tf_normalization_factor,
        }

    input_stats = args.input_idf_stats
    if input_stats is not None:
        idf_stats.read(input_stats)

    processed = 0
    for raw_line in args.docs:
        fields = raw_line.strip().split()
        doc = fields[0]
        terms = fields[1:]
        tf_stats.accumulate(doc, terms, args.ngram_order)

        per_document_output = not args.accumulate_over_docs
        if per_document_output:
            # Write the document-id and the corresponding tf-idf values.
            out = args.tf_idf_file
            print(doc, file=out, end=' ')
            tf_idf.write_tfidf_from_stats(
                tf_stats, idf_stats, args.tf_idf_file,
                **weighting_options(),
                expected_document_id=doc)
            # Stats are per document in this mode, so start over.
            tf_stats = tf_idf.TFStats()

        processed = processed + 1

    if args.accumulate_over_docs:
        # The IDF stats are passed along only if they were not loaded
        # from an input file; otherwise None is passed.
        if args.input_idf_stats is not None:
            tf_stats.compute_term_stats(idf_stats=None)
        else:
            tf_stats.compute_term_stats(idf_stats=idf_stats)

        idf_out = args.output_idf_stats
        if idf_out is not None:
            idf_stats.write(idf_out)
            args.output_idf_stats.close()

        tf_idf.write_tfidf_from_stats(
            tf_stats, idf_stats, args.tf_idf_file,
            **weighting_options())

    if processed == 0:
        raise RuntimeError("Could not compute TF-IDF for any query documents")