def extract_translations(dump_file, target_lang, ref_file_list, output_to_console):
    """Extract the final user translations and the initial MT output from a
    middleware database dump, keyed by document segment, and write the
    per-system output files."""
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)
    doc_to_timing = defaultdict(dict)
    session_id = 0

    stderr('Loading database dump...')
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
    username_set = set()
    for row in dump_row_list:
        username_set.add(row.username)
        text_dict = json.loads(row.text)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(text_dict)
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)
        segment_to_mt = initial_translations_from_imt_log(log) if row.interface == 'imt' else initial_translations_from_pe_log(log)
        for line_id in sorted(segment_to_tgt_txt.keys()):
            doc_id = '%s:%d' % (doc_name, line_id)
            user_id = row.username + ':' + row.interface
            mt_id = 'MT:mt'
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = str2bool(row.valid)
            if line_id in segment_to_mt:
                # Record the initial MT output as a pseudo-user with zero translation time.
                doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
                doc_to_user_time[doc_id][mt_id] = 0.0
                doc_to_user_valid[doc_id][mt_id] = True
            else:
                stderr('WARNING: No MT for %s %s %d' % (row.username, doc_id, line_id))

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt, doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt, doc_to_user_time)
    stderr('Usage: python %s dump_file tgt_lang gender_csv out_file [bleu_directory] [prefix]' % (basename(sys.argv[0])))
    stderr('bleu_directory : output of the extract translations script')
    sys.exit(-1)

# Parse the command line
dump_file = args[0]
target_lang = args[1]
gender_file = args[2]
out_file_name = args[3]
bleu_directory = args[4] if len(args) > 4 else None
quality_prefix = args[5] if len(args) > 5 else 'sbleu'

user_to_gender = imt_utils.load_gender_csv(gender_file)
user_doc_to_sbleu = imt_utils.load_sbleu_files(bleu_directory, quality_prefix + '_ref') if bleu_directory else None
user_doc_to_hbleu = imt_utils.load_sbleu_files(bleu_directory, quality_prefix + '_mt') if bleu_directory else None
dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
output_row_list = []
total_translation_time = defaultdict(Counter)
user_order_to_time = defaultdict(list)

# Load and process the database dump
session_order = 0
condition_order = 0
last_user = None
last_condition = None
for i, row in enumerate(dump_row_list):
    if i > 0 and i % 10 == 0:
        sys.stdout.write('.')
        if i % 800 == 0:
            print
    tgt_text_dict = json.loads(row.text)