def extract_translations(dump_file, target_lang, ref_file_list,
                         output_to_console):
    """Gather per-segment translations from a middleware dump and emit them.

    For every row of the dump, the final target text, the segment time and
    the row's validity flag are recorded under a ``'<doc_name>:<line_id>'``
    key for the user ``'<username>:<interface>'``.  The initial machine
    translation recovered from the row's log is recorded under the pseudo
    user ``'MT:mt'`` (time 0.0, always valid); a warning is written via
    ``stderr`` when a segment has no MT entry in the log.

    Args:
        dump_file: Passed to ``imt_utils.load_middleware_dump``.
        target_lang: Passed to ``imt_utils.load_middleware_dump``.
        ref_file_list: Passed to both ``load_references`` and
            ``load_sources``.
        output_to_console: When truthy, ``console_dump`` is also called with
            the collected texts and times.

    Returns:
        None.  Results are handed to ``output_system_files`` (and optionally
        ``console_dump``).
    """
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)

    stderr('Loading database dump...')
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
    username_set = set()
    # Every MT baseline entry is filed under the same pseudo-user key.
    mt_id = 'MT:mt'
    for row in dump_row_list:
        username_set.add(row.username)
        text_dict = json.loads(row.text)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(text_dict)
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)
        # The log format differs per interface, so the initial MT output is
        # extracted by an interface-specific parser.
        if row.interface == 'imt':
            segment_to_mt = initial_translations_from_imt_log(log)
        else:
            segment_to_mt = initial_translations_from_pe_log(log)

        # These depend only on the row, so compute them once per row rather
        # than once per segment.
        user_id = row.username + ':' + row.interface
        is_valid = str2bool(row.valid)

        for line_id in sorted(segment_to_tgt_txt):
            doc_id = '%s:%d' % (doc_name, line_id)
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            # NOTE(review): assumes segment_to_time has an entry for every
            # translated segment; a plain dict would raise KeyError here.
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = is_valid
            if line_id in segment_to_mt:
                doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
                doc_to_user_time[doc_id][mt_id] = 0.0
                doc_to_user_valid[doc_id][mt_id] = True
            else:
                stderr('WARNING: No MT for %s %s %d' %
                       (row.username, doc_id, line_id))

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt,
                        doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt, doc_to_user_time)
def extract_translations(dump_file,
                         target_lang,
                         ref_file_list,
                         output_to_console):
    """Extract user and MT translations from a database dump and output them.

    Each dump row contributes, for every segment in its final translations,
    the target text, the segment time and the row's validity flag.  Entries
    are keyed by ``'<doc_name>:<line_id>'`` and then by
    ``'<username>:<interface>'``.  The initial machine translation found in
    the row's log is stored under the fixed key ``'MT:mt'`` with time 0.0
    and validity True; segments without one trigger a warning on ``stderr``.

    Args:
        dump_file: Forwarded to ``imt_utils.load_middleware_dump``.
        target_lang: Forwarded to ``imt_utils.load_middleware_dump``.
        ref_file_list: Forwarded to ``load_references`` and ``load_sources``.
        output_to_console: If truthy, additionally call ``console_dump``.

    Returns:
        None.  Output is produced by ``output_system_files`` and, optionally,
        ``console_dump``.
    """
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)

    stderr('Loading database dump...')
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    username_set = set()
    # Key under which the machine-translation baseline is stored.
    mt_id = 'MT:mt'

    for row in imt_utils.load_middleware_dump(dump_file, target_lang):
        username_set.add(row.username)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(
            json.loads(row.text))
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)

        # Pick the log parser that matches the interface used for this row.
        if row.interface == 'imt':
            segment_to_mt = initial_translations_from_imt_log(log)
        else:
            segment_to_mt = initial_translations_from_pe_log(log)

        # Row-level values: hoisted out of the per-segment loop.
        user_id = row.username + ':' + row.interface
        row_is_valid = str2bool(row.valid)

        for line_id in sorted(segment_to_tgt_txt):
            doc_id = '%s:%d' % (doc_name, line_id)
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            # NOTE(review): relies on segment_to_time covering every
            # translated segment; confirm segment_times_from_log guarantees it.
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = row_is_valid

            if line_id not in segment_to_mt:
                stderr('WARNING: No MT for %s %s %d' % (row.username,
                                                        doc_id,
                                                        line_id))
                continue

            doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
            doc_to_user_time[doc_id][mt_id] = 0.0
            doc_to_user_valid[doc_id][mt_id] = True

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt,
                        doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt, doc_to_user_time)
Esempio n. 3
0
    sys.exit(-1)

# Parse the command line.
# Four positional arguments are required; the fifth and sixth are optional.
# NOTE(review): `args` is defined above this fragment -- presumably the
# positional arguments left after option parsing; confirm.
dump_file = args[0]
target_lang = args[1]
gender_file = args[2]
out_file_name = args[3]
# Optional: when no BLEU directory is given, no sBLEU/hBLEU data is loaded.
bleu_directory = args[4] if len(args) > 4 else None
# Optional file-name prefix for the quality files; defaults to 'sbleu'.
quality_prefix = args[5] if len(args) > 5 else 'sbleu'

user_to_gender = imt_utils.load_gender_csv(gender_file)
# Quality scores are loaded twice from the same directory: once with the
# '<prefix>_ref' name and once with '<prefix>_mt'.  Both stay None when
# bleu_directory was not supplied.
user_doc_to_sbleu = imt_utils.load_sbleu_files(
    bleu_directory, quality_prefix + '_ref') if bleu_directory else None
user_doc_to_hbleu = imt_utils.load_sbleu_files(
    bleu_directory, quality_prefix + '_mt') if bleu_directory else None
dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
output_row_list = []
total_translation_time = defaultdict(Counter)
user_order_to_time = defaultdict(list)

# Load and process the database dump.
# NOTE(review): the four variables below look like state carried across loop
# iterations; their updates are not visible in this fragment -- confirm.
session_order = 0
condition_order = 0
last_user = None
last_condition = None
for i, row in enumerate(dump_row_list):
    # Progress indicator: one dot per 10 rows, line break every 800 rows
    # (i.e. 80 dots per line).
    if i > 0 and i % 10 == 0:
        sys.stdout.write('.')
        if i % 800 == 0:
            # Bare `print` is the Python 2 statement that emits a newline;
            # under Python 3 this line is a no-op expression.
            print
    tgt_text_dict = json.loads(row.text)
    stderr('Usage: python %s dump_file tgt_lang gender_csv out_file [bleu_directory] [prefix]' % (basename(sys.argv[0])))
    stderr('bleu_directory : output of the extract translations script')
    sys.exit(-1)

# Parse the command line.
# Four positional arguments are required; the fifth and sixth are optional.
# NOTE(review): `args` is defined above this fragment -- presumably the
# positional arguments left after option parsing; confirm.
dump_file = args[0]
target_lang = args[1]
gender_file = args[2]
out_file_name = args[3]
# Optional: when no BLEU directory is given, no sBLEU/hBLEU data is loaded.
bleu_directory = args[4] if len(args) > 4 else None
# Optional file-name prefix for the quality files; defaults to 'sbleu'.
quality_prefix = args[5] if len(args) > 5 else 'sbleu'

user_to_gender = imt_utils.load_gender_csv(gender_file)
# Quality scores are loaded twice from the same directory: once with the
# '<prefix>_ref' name and once with '<prefix>_mt'.  Both stay None when
# bleu_directory was not supplied.
user_doc_to_sbleu = imt_utils.load_sbleu_files(bleu_directory, quality_prefix+'_ref') if bleu_directory else None
user_doc_to_hbleu = imt_utils.load_sbleu_files(bleu_directory, quality_prefix+'_mt') if bleu_directory else None
dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
output_row_list = []
total_translation_time = defaultdict(Counter)
user_order_to_time = defaultdict(list)

# Load and process the database dump.
# NOTE(review): the four variables below look like state carried across loop
# iterations; their updates are not visible in this fragment -- confirm.
session_order = 0
condition_order = 0
last_user = None
last_condition = None
for i,row in enumerate(dump_row_list):
    # Progress indicator: one dot per 10 rows, line break every 800 rows
    # (i.e. 80 dots per line).
    if i > 0 and i % 10 == 0:
        sys.stdout.write('.')
        if i % 800 == 0:
            # Bare `print` is the Python 2 statement that emits a newline;
            # under Python 3 this line is a no-op expression.
            print
    tgt_text_dict = json.loads(row.text)