def combine_lists(languages):
    lines = []
    for language in languages:
        sources = source_names(language)
        input_files = [
            wordlist_filename(source, language, 'counts.txt')
            for source in sources
        ]
        output_file = wordlist_filename('combined', language)
        add_dep(lines, 'merge', input_files, output_file,
                extra='wordfreq_builder/word_counts.py',
                params={'cutoff': 2, 'lang': language})

        output_cBpack = wordlist_filename(
            'combined-dist', language, 'msgpack.gz'
        )
        output_cBpack_big = wordlist_filename(
            'combined-dist-large', language, 'msgpack.gz'
        )
        add_dep(lines, 'freqs2cB', output_file, output_cBpack,
                extra='wordfreq_builder/word_counts.py',
                params={'lang': language, 'buckets': 600})
        add_dep(lines, 'freqs2cB', output_file, output_cBpack_big,
                extra='wordfreq_builder/word_counts.py',
                params={'lang': language, 'buckets': 800})

        lines.append('default {}'.format(output_cBpack))
        if language in CONFIG['big-lists']:
            lines.append('default {}'.format(output_cBpack_big))

        # Write standalone lists for Twitter frequency
        if language in CONFIG['sources']['twitter']:
            input_file = wordlist_filename('twitter', language, 'counts.txt')
            output_cBpack = wordlist_filename(
                'twitter-dist', language, 'msgpack.gz')
            add_dep(lines, 'freqs2cB', input_file, output_cBpack,
                    extra='wordfreq_builder/word_counts.py',
                    params={'lang': language, 'buckets': 600})

            lines.append('default {}'.format(output_cBpack))

    # Write a Jieba-compatible frequency file for Chinese tokenization
    chinese_combined = wordlist_filename('combined', 'zh')
    jieba_output = wordlist_filename('jieba-dist', 'zh')
    add_dep(lines, 'counts_to_jieba', chinese_combined, jieba_output,
            extra=['wordfreq_builder/word_counts.py',
                   'wordfreq_builder/cli/counts_to_jieba.py'])
    lines.append('default {}'.format(jieba_output))

    return lines
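
# For illustration only: a minimal sketch of how a helper like add_dep is
# assumed to behave. The real add_dep is defined elsewhere in this module;
# the name _example_add_dep and its exact behavior here are assumptions,
# shown only to clarify what combine_lists appends to `lines`. The format
# itself is standard Ninja syntax: a 'build' statement naming the output,
# the rule, the inputs, optional implicit dependencies after '|', and
# indented variable bindings on the following lines.
def _example_add_dep(lines, rule, inputs, output, extra=None, params=None):
    # Ninja takes space-separated lists of inputs and outputs.
    if isinstance(inputs, list):
        inputs = ' '.join(inputs)
    if isinstance(output, list):
        output = ' '.join(output)
    # Implicit dependencies (files that should trigger a rebuild but are not
    # passed to the rule as inputs) go after a '|' separator.
    if extra:
        if isinstance(extra, list):
            extra = ' '.join(extra)
        extra = ' | ' + extra
    else:
        extra = ''
    lines.append('build {}: {} {}{}'.format(output, rule, inputs, extra))
    # Rule variables such as 'lang' and 'buckets' become indented bindings.
    for key, value in (params or {}).items():
        lines.append('  {} = {}'.format(key, value))
    lines.append('')

# Example: _example_add_dep(lines, 'merge', ['a.txt', 'b.txt'], 'out.txt',
#                           params={'lang': 'en'}) would append:
#   build out.txt: merge a.txt b.txt
#     lang = en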