# Example 1 (score: 0)
def combine_lists(languages):
    """Emit the build rules that merge per-source word counts into a
    combined wordlist for each language and export centibel-bucketed
    distribution files.

    Ends by adding a rule that writes a Jieba-compatible frequency file
    for Chinese tokenization, and returns all accumulated rule lines.
    """
    rules = []
    for lang in languages:
        count_files = [
            wordlist_filename(src, lang, 'counts.txt')
            for src in source_names(lang)
        ]
        combined = wordlist_filename('combined', lang)
        add_dep(rules, 'merge', count_files, combined,
                extra='wordfreq_builder/word_counts.py',
                params={'cutoff': 2, 'lang': lang})

        dist = wordlist_filename('combined-dist', lang, 'msgpack.gz')
        dist_big = wordlist_filename('combined-dist-large', lang, 'msgpack.gz')
        # 600 buckets for the standard list, 800 for the large variant.
        add_dep(rules, 'freqs2cB', combined, dist,
                extra='wordfreq_builder/word_counts.py',
                params={'lang': lang, 'buckets': 600})
        add_dep(rules, 'freqs2cB', combined, dist_big,
                extra='wordfreq_builder/word_counts.py',
                params={'lang': lang, 'buckets': 800})

        rules.append(f'default {dist}')
        if lang in CONFIG['big-lists']:
            rules.append(f'default {dist_big}')

        # Standalone Twitter frequency list, only where that source exists.
        if lang in CONFIG['sources']['twitter']:
            tw_counts = wordlist_filename('twitter', lang, 'counts.txt')
            tw_dist = wordlist_filename('twitter-dist', lang, 'msgpack.gz')
            add_dep(rules, 'freqs2cB', tw_counts, tw_dist,
                    extra='wordfreq_builder/word_counts.py',
                    params={'lang': lang, 'buckets': 600})
            rules.append(f'default {tw_dist}')

    # Write a Jieba-compatible frequency file for Chinese tokenization.
    zh_combined = wordlist_filename('combined', 'zh')
    jieba_out = wordlist_filename('jieba-dist', 'zh')
    add_dep(rules, 'counts_to_jieba', zh_combined, jieba_out,
            extra=['wordfreq_builder/word_counts.py',
                   'wordfreq_builder/cli/counts_to_jieba.py'])
    rules.append(f'default {jieba_out}')
    return rules
# Example 2 (score: 0)
def combine_lists(languages):
    """Emit build rules that merge each language's per-source counts into a
    combined wordlist and export it as a msgpack.gz distribution file.

    Returns the accumulated rule lines.
    """
    rules = []
    for lang in languages:
        per_source = [
            wordlist_filename(src, lang, 'counts.txt')
            for src in source_names(lang)
        ]
        merged = wordlist_filename('combined', lang)
        add_dep(rules, 'merge', per_source, merged,
                extra='wordfreq_builder/word_counts.py')

        dist_file = wordlist_filename('combined-dist', lang, 'msgpack.gz')
        add_dep(rules, 'freqs2cB', merged, dist_file,
                extra='wordfreq_builder/word_counts.py',
                params={'lang': lang})

        rules.append(f'default {dist_file}')

        # Also ship a standalone Twitter-only frequency list when that
        # source is configured for this language.
        if lang in CONFIG['sources']['twitter']:
            tw_counts = wordlist_filename('twitter', lang, 'counts.txt')
            tw_dist = wordlist_filename('twitter-dist', lang, 'msgpack.gz')
            add_dep(rules, 'freqs2cB', tw_counts, tw_dist,
                    extra='wordfreq_builder/word_counts.py',
                    params={'lang': lang})
            rules.append(f'default {tw_dist}')

    return rules
# Example 3 (score: 0)
def combine_lists(languages):
    """Produce the rules that build each language's combined wordlist and
    its msgpack.gz distribution, plus standalone Twitter lists.

    Returns the list of rule lines.
    """
    out = []
    for lang in languages:
        counts = [wordlist_filename(src, lang, "counts.txt")
                  for src in source_names(lang)]
        merged = wordlist_filename("combined", lang)
        add_dep(out, "merge", counts, merged,
                extra="wordfreq_builder/word_counts.py")

        dist = wordlist_filename("combined-dist", lang, "msgpack.gz")
        add_dep(out, "freqs2cB", merged, dist,
                extra="wordfreq_builder/word_counts.py",
                params={"lang": lang})
        out.append("default {}".format(dist))

        # Standalone Twitter frequency list, only where that source exists.
        if lang in CONFIG["sources"]["twitter"]:
            tw_counts = wordlist_filename("twitter", lang, "counts.txt")
            tw_dist = wordlist_filename("twitter-dist", lang, "msgpack.gz")
            add_dep(out, "freqs2cB", tw_counts, tw_dist,
                    extra="wordfreq_builder/word_counts.py",
                    params={"lang": lang})
            out.append("default {}".format(tw_dist))

    return out
# Example 4 (score: 0)
def combine_lists(languages):
    """Generate the rules that merge per-source counts into combined
    wordlists and export centibel-bucketed distribution files.

    Appends a final rule producing a Jieba-compatible Chinese frequency
    file, and returns all accumulated rule lines.
    """
    # Shared dependency: the module implementing merge/freqs2cB/jieba steps.
    counts_module = 'wordfreq_builder/word_counts.py'
    rules = []
    for lang in languages:
        source_counts = [wordlist_filename(src, lang, 'counts.txt')
                         for src in source_names(lang)]
        merged = wordlist_filename('combined', lang)
        add_dep(rules, 'merge', source_counts, merged,
                extra=counts_module, params={'cutoff': 2, 'lang': lang})

        dist = wordlist_filename('combined-dist', lang, 'msgpack.gz')
        dist_large = wordlist_filename('combined-dist-large', lang,
                                       'msgpack.gz')
        # 600 buckets for the standard list, 800 for the large variant.
        add_dep(rules, 'freqs2cB', merged, dist,
                extra=counts_module, params={'lang': lang, 'buckets': 600})
        add_dep(rules, 'freqs2cB', merged, dist_large,
                extra=counts_module, params={'lang': lang, 'buckets': 800})

        rules.append('default {}'.format(dist))
        if lang in CONFIG['big-lists']:
            rules.append('default {}'.format(dist_large))

        # Standalone Twitter frequency list where that source is configured.
        if lang in CONFIG['sources']['twitter']:
            tw_counts = wordlist_filename('twitter', lang, 'counts.txt')
            tw_dist = wordlist_filename('twitter-dist', lang, 'msgpack.gz')
            add_dep(rules, 'freqs2cB', tw_counts, tw_dist,
                    extra=counts_module,
                    params={'lang': lang, 'buckets': 600})
            rules.append('default {}'.format(tw_dist))

    # Write a Jieba-compatible frequency file for Chinese tokenization.
    zh_combined = wordlist_filename('combined', 'zh')
    jieba_out = wordlist_filename('jieba-dist', 'zh')
    add_dep(rules, 'counts_to_jieba', zh_combined, jieba_out,
            extra=[counts_module,
                   'wordfreq_builder/cli/counts_to_jieba.py'])
    rules.append('default {}'.format(jieba_out))
    return rules