def get_edit_distances(tgt_segments, ref_segments):
    """Compute one edit distance per aligned target/reference pair.

    The two inputs are paired positionally with ``zip``, so any surplus
    items in the longer input are ignored.

    Args:
        tgt_segments: Iterable of target segments.
        ref_segments: Iterable of reference segments.
    Returns:
        A list holding, in input order, the value returned by
        ``dameraulevenshtein(ref, tgt, True)`` for each pair.
    Raises:
        Nothing directly; errors raised by ``dameraulevenshtein`` propagate.
    """
    # NOTE(review): the meaning of the trailing ``True`` flag is defined by
    # dameraulevenshtein, which is not visible here -- confirm before changing.
    return [
        dameraulevenshtein(reference, target, True)
        for target, reference in zip(tgt_segments, ref_segments)
    ]
def get_edit_distances(tgt_segments, ref_segments):
    """Return the edit distance of every target segment to its reference.

    Targets and references are matched by position using ``zip``; pairing
    stops as soon as the shorter input is exhausted.

    Args:
        tgt_segments: Iterable of target segments.
        ref_segments: Iterable of reference segments.
    Returns:
        A list with one entry per pair: the result of calling
        ``dameraulevenshtein(ref_line, tgt_line, True)``.
    Raises:
        Nothing directly; errors raised by ``dameraulevenshtein`` propagate.
    """
    aligned_pairs = zip(tgt_segments, ref_segments)
    # The reference is passed first, the target second, as in every call
    # to dameraulevenshtein made by this function.
    return [
        dameraulevenshtein(ref_text, tgt_text, True)
        for tgt_text, ref_text in aligned_pairs
    ]
Example #3
0
def run_test(ref_file, tgt_list, tgt_meta_list):
    """Accumulate per-segment edit distances for two UI ids and test them.

    Each target file is compared segment by segment against the reference
    segments with both ``levenshtein`` and ``dameraulevenshtein``. The
    distances are summed per segment index, separately for segments whose
    UI id is 1 (the ``*_a`` counters) and 2 (the ``*_b`` counters), along
    with the number of observations. The sums and counts are then passed
    to ``diff_test``, once per distance metric.

    Args:
        ref_file: Passed to ``load_segments`` to obtain reference segments.
        tgt_list: Iterable of target files, each passed to
            ``load_segments``.
        tgt_meta_list: Iterable of meta files, paired positionally with
            ``tgt_list`` via ``zip``; each is passed to
            ``ids_from_meta_file`` and indexed by segment position.
    Returns:
        None. Two headers are printed and ``diff_test`` is called twice.
    Raises:
        RuntimeError: If a segment's UI id is neither 1 nor 2.
    """
    ref_segments = load_segments(ref_file)

    # Sufficient statistics: distance sums (counts_*) and number of
    # observations (nums_*), keyed by segment index.
    counts_lev_a = Counter()
    nums_lev_a = Counter()
    counts_lev_b = Counter()
    nums_lev_b = Counter()
    counts_dlev_a = Counter()
    nums_dlev_a = Counter()
    counts_dlev_b = Counter()
    nums_dlev_b = Counter()
    for (tgt_file, meta_file) in zip(tgt_list, tgt_meta_list):
        tgt_segments = load_segments(tgt_file)
        ui_ids = ids_from_meta_file(meta_file)
        for i, tgt_txt in enumerate(tgt_segments):
            ref_txt = ref_segments[i]
            lev_dist = levenshtein(ref_txt, tgt_txt, True)
            dlev_dist = dameraulevenshtein(ref_txt, tgt_txt, True)
            ui_id = ui_ids[i]
            if ui_id == 1:
                counts_lev_a[i] += lev_dist
                counts_dlev_a[i] += dlev_dist
                nums_lev_a[i] += 1
                nums_dlev_a[i] += 1
            elif ui_id == 2:
                counts_lev_b[i] += lev_dist
                counts_dlev_b[i] += dlev_dist
                nums_lev_b[i] += 1
                nums_dlev_b[i] += 1
            else:
                # Say which record was bad instead of raising a bare error.
                raise RuntimeError(
                    'Unexpected UI id %r for segment %d in meta file %r'
                    % (ui_id, i, meta_file))

    # Single-argument parenthesised prints produce identical output under
    # the Python 2 print statement and the Python 3 print function.
    print('Levenshtein distance')
    diff_test(counts_lev_a, nums_lev_a, counts_lev_b, nums_lev_b)
    print('')
    print('Damerau-Levenshtein distance')
    diff_test(counts_dlev_a, nums_dlev_a, counts_dlev_b, nums_dlev_b)
def run_test(ref_file, tgt_list, tgt_meta_list):
    """Collect edit-distance statistics per UI id and run ``diff_test``.

    For every (target file, meta file) pair, each target segment is scored
    against the reference segment at the same index using ``levenshtein``
    and ``dameraulevenshtein``. Scores and observation counts are
    accumulated per segment index: UI id 1 feeds the ``*_a`` counters and
    UI id 2 feeds the ``*_b`` counters. ``diff_test`` is then invoked once
    for each distance metric.

    Args:
        ref_file: Passed to ``load_segments`` to obtain reference segments.
        tgt_list: Iterable of target files, each passed to
            ``load_segments``.
        tgt_meta_list: Iterable of meta files, paired positionally with
            ``tgt_list`` via ``zip``; each is passed to
            ``ids_from_meta_file`` and indexed by segment position.
    Returns:
        None. Two headers are printed and ``diff_test`` is called twice.
    Raises:
        RuntimeError: If a segment's UI id is neither 1 nor 2.
    """
    ref_segments = load_segments(ref_file)

    # Sufficient statistics: distance sums (counts_*) and number of
    # observations (nums_*), keyed by segment index.
    counts_lev_a = Counter()
    nums_lev_a = Counter()
    counts_lev_b = Counter()
    nums_lev_b = Counter()
    counts_dlev_a = Counter()
    nums_dlev_a = Counter()
    counts_dlev_b = Counter()
    nums_dlev_b = Counter()
    for (tgt_file, meta_file) in zip(tgt_list, tgt_meta_list):
        tgt_segments = load_segments(tgt_file)
        ui_ids = ids_from_meta_file(meta_file)
        for i, tgt_txt in enumerate(tgt_segments):
            ref_txt = ref_segments[i]
            lev_dist = levenshtein(ref_txt, tgt_txt, True)
            dlev_dist = dameraulevenshtein(ref_txt, tgt_txt, True)
            ui_id = ui_ids[i]
            if ui_id == 1:
                counts_lev_a[i] += lev_dist
                counts_dlev_a[i] += dlev_dist
                nums_lev_a[i] += 1
                nums_dlev_a[i] += 1
            elif ui_id == 2:
                counts_lev_b[i] += lev_dist
                counts_dlev_b[i] += dlev_dist
                nums_lev_b[i] += 1
                nums_dlev_b[i] += 1
            else:
                # Identify the offending record rather than raising a bare
                # RuntimeError with no context.
                raise RuntimeError(
                    'Unexpected UI id %r for segment %d in meta file %r'
                    % (ui_id, i, meta_file))

    # Parenthesised single-argument prints behave the same as the Python 2
    # print statement and also parse under Python 3.
    print('Levenshtein distance')
    diff_test(counts_lev_a, nums_lev_a, counts_lev_b, nums_lev_b)
    print('')
    print('Damerau-Levenshtein distance')
    diff_test(counts_dlev_a, nums_dlev_a, counts_dlev_b, nums_dlev_b)
Example #5
0
    segment_to_time = imt_utils.segment_times_from_log(log)
    segment_to_mt = imt_utils.initial_translations_from_imt_log(
        log
    ) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log(
        log)
    segment_to_src_txt = imt_utils.source_segments_from_log(log)
    doc_name = imt_utils.url2doc(row.src_doc)
    doc_genre = imt_utils.genre_from_url(row.src_doc)

    for line_id in sorted(segment_to_tgt_txt.keys()):
        # TODO: hack for a user with bad logs
        edist = 0
        if line_id in segment_to_mt:
            mt_tgt_txt = segment_to_mt[line_id]
            user_tgt_txt = segment_to_tgt_txt[line_id]
            edist = edit_distance.dameraulevenshtein(mt_tgt_txt, user_tgt_txt,
                                                     True)
        segment_id = '%s:%d' % (doc_name, line_id)
        time = segment_to_time[line_id]
        total_translation_time[row.username][row.interface] += time
        total_translation_time[row.username][row.interface + '_nseg'] += 1
        order = int(row.order)
        if not (last_user or last_condition):
            last_user = row.username
            last_condition = row.interface
        if row.username != last_user:
            session_order = 0
            condition_order = 0
        elif last_condition != row.interface:
            condition_order = 0
        time_key = '%s:%d' % (row.username, order)
        user_order_to_time[time_key].append(time)
    doc_name = imt_utils.url2doc(row.src_doc)
    log = json.loads(row.log)
    segment_to_time = imt_utils.segment_times_from_log(log)
    segment_to_mt = imt_utils.initial_translations_from_imt_log(log) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log(log)
    segment_to_src_txt = imt_utils.source_segments_from_log(log)
    doc_name = imt_utils.url2doc(row.src_doc)
    doc_genre = imt_utils.genre_from_url(row.src_doc)

    for line_id in sorted(segment_to_tgt_txt.keys()):
        # TODO: hack for a user with bad logs
        edist = 0
        if line_id in segment_to_mt:
            mt_tgt_txt = segment_to_mt[line_id]
            user_tgt_txt = segment_to_tgt_txt[line_id]
            edist = edit_distance.dameraulevenshtein(mt_tgt_txt,
                                                     user_tgt_txt,
                                                     True)
        segment_id = '%s:%d' % (doc_name, line_id)
        time = segment_to_time[line_id]
        total_translation_time[row.username][row.interface] += time
        total_translation_time[row.username][row.interface+'_nseg'] += 1
        order = int(row.order)
        if not (last_user or last_condition):
            last_user = row.username
            last_condition = row.interface
        if row.username != last_user:
            session_order = 0
            condition_order = 0
        elif last_condition != row.interface:
            condition_order = 0
        time_key = '%s:%d' % (row.username,order)