def align_samples(model, lang1, lang2, art):
    """Align one article's parallel texts and write the result to disk.

    :model: path to a trained Yalign model directory
    :lang1: source language code
    :lang2: target language code
    :art: article dict with original texts under ``art['orig']`` and an
        output directory under ``art['path']``
    :returns: None; writes ``<lang>_yalign`` files into ``art['path']``

    """
    model = YalignModel.load(model)
    try:
        doc1 = text_to_document(remove_extra_spaces(art['orig'][lang1]))
        doc2 = text_to_document(remove_extra_spaces(art['orig'][lang2]))
        pairs = model.align(doc1, doc2)
    except Exception:
        log.error('Error aligning %s', art['path'])
    else:
        # to_text() returns a plain <str> for each aligned sentence
        pairs = [(p1.to_text(), p2.to_text()) for p1, p2 in pairs]
        text1 = ''.join(p[0] + '\n' for p in pairs)
        text2 = ''.join(p[1] + '\n' for p in pairs)
        path1 = os.path.join(art['path'], '{}_yalign'.format(lang1))
        path2 = os.path.join(art['path'], '{}_yalign'.format(lang2))
        with open(path1, 'w') as f:
            f.write(text1)
        with open(path2, 'w') as f:
            f.write(text2)
    finally:
        # YalignModel instances are large; release them eagerly
        del model
        gc.collect()
def _align_text(art_pair):
    """Align a pair of article texts with the globally loaded ``model``.

    :art_pair: ``(art1, art2)`` tuple of dicts holding the raw text under
        ``['text']``
    :returns: ``(text1, text2)`` newline-joined aligned sentences, or
        ``None`` if alignment fails

    """
    art1, art2 = art_pair
    try:
        doc1 = text_to_document(remove_extra_spaces(art1['text']))
        doc2 = text_to_document(remove_extra_spaces(art2['text']))
        pairs = model.align(doc1, doc2)
    except Exception:
        return None
    else:
        pairs = [(p1.to_text(), p2.to_text()) for p1, p2 in pairs]
        text1 = ''.join(p[0] + '\n' for p in pairs)
        text2 = ''.join(p[1] + '\n' for p in pairs)
        return (text1, text2)
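
# Hypothetical usage sketch (not part of the original module): _align_text
# takes a single (art1, art2) tuple and relies on an already-loaded global
# ``model``, the usual shape for a multiprocessing.Pool worker inherited via
# fork. The names ``model_path``, ``arts1`` and ``arts2`` are assumptions:
#
#     model = YalignModel.load(model_path)
#     pool = multiprocessing.Pool()
#     aligned = pool.map(_align_text, zip(arts1, arts2))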
def get_similarity(lang1, lang2, sample_path, threshold, penalty, model,
                   len_flt, sim_flt):
    """Score a (threshold, penalty) pair against human-aligned samples.

    :lang1: source language code
    :lang2: target language code
    :sample_path: directory of samples yielded by ``iter_samples``
    :threshold: alignment threshold written into the model metadata
    :penalty: alignment penalty written into the model metadata
    :model: path to the Yalign model directory
    :len_flt: length filter passed through to ``line_sim``
    :sim_flt: similarity filter passed through to ``line_sim``
    :returns: mean similarity between the model's and the human alignments

    """
    with model_lock:
        model_conf_path = os.path.join(model, 'metadata.json')
        update_model(model_conf_path, dict(threshold=threshold, penalty=penalty))
        model = YalignModel.load(model)
    similarity = []
    for sample in iter_samples(sample_path):
        try:
            doc1 = text_to_document(remove_extra_spaces(sample['orig'][lang1]))
            doc2 = text_to_document(remove_extra_spaces(sample['orig'][lang2]))
            pairs = model.align(doc1, doc2)
        except Exception:
            similarity.append(0)
        else:
            # only the lang2 side is compared against the human reference
            text2 = ''.join(p[1].to_text() + '\n' for p in pairs)
            try:
                text2 = text2.decode('utf-8')
            except UnicodeError:
                del model
                gc.collect()
                return 0
            similarity.append(line_sim(text2, sample['human'][lang2],
                                       len_flt, sim_flt))
    del model
    gc.collect()
    return sum(similarity) / len(similarity)
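
# Hypothetical tuning sketch (not part of the original module): get_similarity
# is shaped like an objective for a grid search over the two Yalign
# parameters. The language codes and the ``thresholds``/``penalties`` grids
# are assumptions:
#
#     scores = ((t, p, get_similarity('en', 'ru', samples_dir, t, p,
#                                     model_dir, len_flt, sim_flt))
#               for t in thresholds for p in penalties)
#     best_t, best_p, best_score = max(scores, key=lambda r: r[2])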