def run(align_func, dataset_path, output_path):
    """
    Read the word pairs from a psa dataset, align them using the given func,
    and write an output psa dataset.

    If there is a word that cannot be converted to ASJP, an obviously wrong
    alignment is output instead.
    """
    dataset = AlignmentsDataset(dataset_path)
    lines = ['{} (PMI alignment)'.format(dataset.header)]

    for word_a, word_b, original_al in dataset.data:
        try:
            asjp_a = ipa2asjp(word_a.ipa)
            asjp_b = ipa2asjp(word_b.ipa)
        except ValueError:
            # Conversion failed: emit a deliberately bad alignment (a lone
            # gap followed by the raw IPA tokens) so the pair is not lost.
            lines.append(original_al.comment)
            lines.append('\t'.join([word_a.lang, '-'] + list(word_a.ipa)))
            lines.append('\t'.join([word_b.lang, '-'] + list(word_b.ipa)))
            lines.append('')
            continue

        # align_func may yield several equally good alignments; write them all.
        for asjp_al in align_func(asjp_a, asjp_b):
            ipa_corr = convert_alignment(word_a.ipa, word_b.ipa, asjp_al.corr)
            row_a = [pair[0] for pair in ipa_corr]
            row_b = [pair[1] for pair in ipa_corr]
            lines.append(original_al.comment)
            lines.append('\t'.join([word_a.lang] + row_a))
            lines.append('\t'.join([word_b.lang] + row_b))
            lines.append('')

    with open_for_writing(output_path) as f:
        f.write('\n'.join(lines))
def write_alignments(alignments, path=None, header='OUTPUT'):
    """
    Write a list of (Word, Word, Alignment) tuples to a psa file. The last
    element of each tuple should be an Alignment named tuple from either this
    or the align module.

    If path is None or '-', use stdout.
    """
    lines = [header]

    for word_a, word_b, alignment in alignments:
        # Pad both language names with dots to a common width (psa style).
        width = max(len(word_a.lang), len(word_b.lang))
        lang_a = '{:.<{}}'.format(word_a.lang, width)
        lang_b = '{:.<{}}'.format(word_b.lang, width)

        # Empty tokens in a correspondence mark gaps; render them as '-'.
        tokens_a = [tok if tok else '-' for tok, _ in alignment.corr]
        tokens_b = [tok if tok else '-' for _, tok in alignment.corr]

        comment = (alignment.comment if hasattr(alignment, 'comment')
                   else str(word_a.concept))

        lines.append(comment)
        lines.append('\t'.join([lang_a] + tokens_a))
        lines.append('\t'.join([lang_b] + tokens_b))
        lines.append('')

    with open_for_writing(path) as f:
        f.write('\n'.join(lines))
def write_pairs(path, word_pairs):
    """
    Write a [] of pairs of IPA sequences to a file, one tab-separated pair
    per line, with the IPA tokens separated by spaces.
    """
    lines = ['\t'.join(' '.join(seq) for seq in pair) for pair in word_pairs]
    with open_for_writing(path) as f:
        f.write('\n'.join(lines))
def write_words(words, path=None, dialect='excel-tab',
                header=WordsDataset.DEFAULT_COLUMNS, tokenised=False):
    """
    Write the words ([] of Word tuples) to a csv file using the given
    dialect. If path is None or '-', use stdout.

    The header arg should be a list of the headings for the language,
    concept, and transcription columns, respectively.

    If tokenised is set to True, separate the IPA tokens with spaces.
    """
    sep = ' ' if tokenised else ''
    with open_for_writing(path, newline='') as f:
        writer = csv.writer(f, dialect=dialect)
        writer.writerow(header)
        writer.writerows(
            (word.lang, word.concept, sep.join(word.ipa)) for word in words)
def run_sca(dataset_path, output_path):
    """
    Read the word pairs from a psa dataset, align them using the SCA
    algorithm, and write an output psa dataset.
    """
    dataset = AlignmentsDataset(dataset_path, keep_digits=True)
    lines = ['{} (SCA alignment)'.format(dataset.header)]

    for word_a, word_b, alignment in dataset.data:
        pairwise = Pairwise(word_a.ipa, word_b.ipa,
                            merge_vowels=False, merge_geminates=False)
        pairwise.align()
        # Take only the top-ranked alignment; the score is not written out.
        align_a, align_b, _ = pairwise.alignments[0]

        lines.append(alignment.comment)
        lines.append('\t'.join([word_a.lang] + align_a))
        lines.append('\t'.join([word_b.lang] + align_b))
        lines.append('')

    with open_for_writing(output_path) as f:
        f.write('\n'.join(lines))