Exemple #1
0
def align_samples(model, lang1, lang2, art):
    """Align two language versions of an article and write them to disk.

    The aligned sentences are written one per line to two files named
    ``<lang1>_yalign`` and ``<lang2>_yalign`` inside ``art['path']``.
    Line N of one file corresponds to line N of the other.

    If building the documents or aligning them fails, the error is logged
    (with traceback) and nothing is written.

    :model: value accepted by ``YalignModel.load`` (the model to load)
    :lang1: key of the first language in ``art['orig']``
    :lang2: key of the second language in ``art['orig']``
    :art: mapping with an ``'orig'`` entry (language key -> raw text) and a
        ``'path'`` entry (directory that receives the output files)
    :returns: None

    """
    model = YalignModel.load(model)
    try:
        doc1 = text_to_document(remove_extra_spaces(art['orig'][lang1]))
        doc2 = text_to_document(remove_extra_spaces(art['orig'][lang2]))
        pairs = model.align(doc1, doc2)
    except Exception:
        # Catch ``Exception`` rather than everything, so KeyboardInterrupt
        # and SystemExit still stop the program; keep the traceback so the
        # failure can actually be diagnosed from the log.
        log.exception('Error aligning %s', art['path'])
    else:
        # to_text returns <str>
        pairs = [(p1.to_text(), p2.to_text()) for p1, p2 in pairs]
        text1 = ''.join(first + '\n' for first, _ in pairs)
        text2 = ''.join(second + '\n' for _, second in pairs)
        path1 = os.path.join(art['path'], '{}_yalign'.format(lang1))
        path2 = os.path.join(art['path'], '{}_yalign'.format(lang2))
        with open(path1, 'w') as f:
            f.write(text1)
        with open(path2, 'w') as f:
            f.write(text2)
    finally:
        # Drop the loaded model and force a collection on every exit path.
        del model
        gc.collect()
Exemple #2
0
def _align_text(art_pair):
    art1, art2 = art_pair
    try:
        doc1 = text_to_document(remove_extra_spaces(art1['text']))
        doc2 = text_to_document(remove_extra_spaces(art2['text']))
        pairs = model.align(doc1, doc2)
    except:
        return None
    else:
        pairs = [(p[0].to_text(), p[1].to_text()) for p in pairs]
        text1 = [p[0] for p in pairs]
        text1 = [t+'\n' for t in text1]
        text1 = ''.join(text1)
        text2 = [p[1] for p in pairs]
        text2 = [t+'\n' for t in text2]
        text2 = ''.join(text2)
    return (text1, text2)
Exemple #3
0
def get_similarity(lang1, lang2, sample_path, threshold, penalty, model,
                   len_flt, sim_flt):
    """Return the mean similarity between automatic and human alignments.

    The model's ``metadata.json`` is first updated with the given
    ``threshold`` and ``penalty``, then the model is loaded.  Every sample
    yielded by ``iter_samples(sample_path)`` is aligned, and the aligned
    ``lang2`` text is compared to ``sample['human'][lang2]`` with
    ``line_sim``.  A sample whose alignment fails contributes a score of 0.

    :lang1: key of the first language in ``sample['orig']``
    :lang2: key of the second language in ``sample['orig']`` and
        ``sample['human']``
    :sample_path: passed to ``iter_samples`` to enumerate the samples
    :threshold: value stored under ``threshold`` in the model metadata
    :penalty: value stored under ``penalty`` in the model metadata
    :model: path of the model directory (contains ``metadata.json``)
    :len_flt: forwarded unchanged to ``line_sim``
    :sim_flt: forwarded unchanged to ``line_sim``
    :returns: average of the per-sample scores; 0 if there are no samples
        or if the aligned text of any sample cannot be decoded

    """
    # Rewriting the metadata and loading the model happen under the lock;
    # presumably so a concurrent caller cannot change metadata.json between
    # the update and the load -- TODO confirm against other users of the lock.
    with model_lock:
        model_conf_path = os.path.join(model, 'metadata.json')
        update_model(model_conf_path, dict(threshold=threshold, penalty=penalty))
        model = YalignModel.load(model)
    try:
        similarity = []
        for sample in iter_samples(sample_path):
            try:
                doc1 = text_to_document(remove_extra_spaces(sample['orig'][lang1]))
                doc2 = text_to_document(remove_extra_spaces(sample['orig'][lang2]))
                pairs = model.align(doc1, doc2)
            except Exception:
                # A sample that cannot be aligned scores 0.  Only ordinary
                # errors are absorbed; KeyboardInterrupt / SystemExit
                # propagate.
                similarity.append(0)
                continue
            # Only the second-language side is scored, one sentence per line.
            text2 = ''.join(p[1].to_text() + '\n' for p in pairs)
            try:
                text2 = text2.decode('utf-8')
            except UnicodeEncodeError:
                # NOTE(review): on Python 2 this fires when text2 is already
                # unicode with non-ASCII characters (implicit ASCII encode
                # before decoding).  The whole evaluation is abandoned with
                # a score of 0, as before -- confirm this is intended.
                return 0
            similarity.append(
                line_sim(text2, sample['human'][lang2], len_flt, sim_flt))
        if not similarity:
            # No samples at all: avoid dividing by zero.
            return 0
        return sum(similarity) / len(similarity)
    finally:
        # Single cleanup point for every exit path (normal return, early
        # return, or exception).
        del model
        gc.collect()