def infer_lexstat(datasets_dir, output_dir):
    """
    Find the datasets located in datasets_dir and run _infer_lexstat on each
    of them, using the lingpy transcription schema that matches the data.

    Output files are named <dataset>.lsCC.csv inside output_dir.
    """
    for path in find_all_datasets(datasets_dir):
        dataset_name = os.path.basename(path).split('.')[0]
        target = os.path.join(output_dir, '{}.lsCC.csv'.format(dataset_name))

        # Pick the transcription schema based on the dataset's contents.
        if is_asjp_data(load_data(path)):
            schema = 'asjp'
        else:
            schema = 'ipa'

        with set_schema(schema):
            _infer_lexstat(path, target)
def patch_targets(dataset_path):
    """
    Re-outputs the dataset's targets.

    Collects the synonymous-pair sample keys for every ordered language pair
    (lang1 < lang2) and feeds them to load_targets.
    """
    data = load_data(dataset_path)

    keys = []
    # Same traversal order as iterating the full product filtered by a < b.
    for lang1 in data.keys():
        for lang2 in data.keys():
            if not lang1 < lang2:
                continue
            synonyms, _ = get_pairs(lang1, lang2, data)
            keys.extend(synonyms.keys())

    return load_targets(dataset_path, keys, data.keys())
def check(dataset_path, params_dir):
    """
    Performs a dry run of the prepare command.

    Returns a helpful string reporting the results of the performed checks.

    Note that the last check (the LexStat one) can take a long time on
    larger datasets.
    """
    params = load_params(params_dir)
    data = load_data(dataset_path)

    report = [
        check_asjp_conversion(data, params),
        check_pmi(data, params),
        check_load_targets(data, dataset_path),
        check_lexstat(data, dataset_path)]

    return '\n\n'.join(report)
def patch_lexstat(dataset_path, vectors_path):
    """
    Re-calculates the dataset's LexStat scores and writes them back into the
    vectors file at vectors_path (columns lexstat_simAA/BB/AB).

    The vectors file is rewritten in place using the VECTORS_COLS column
    order and a fixed float format.
    """
    data = load_data(dataset_path)
    all_langs = list(data.keys())
    lang_pairs = [(a, b) for a in all_langs for b in all_langs if a < b]

    lexstat_samples = {}

    schema = 'asjp' if is_asjp_data(data) else 'ipa'
    with set_schema(schema):
        lingpy_wordlist = make_wordlist(data, dataset_path, schema)
        for lang1, lang2 in lang_pairs:
            scores = calc_lexstat(lang1, lang2, lingpy_wordlist)
            for key, score in scores.items():
                key = explode_sample_id(key, all_langs)
                lexstat_samples[key] = list(score)

    gloss_d = {}  # gloss to global gloss id
    # encoding made explicit for consistency with the vectors file below
    with open(dataset_path, encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            gloss_d[row['gloss']] = row['global_id']

    # Index the samples by their (global gloss id, lang1, lang2) prefix so
    # each vector's lookup is O(1) instead of a linear scan.  Several sample
    # keys may share a prefix; they are consumed first-in, first-out, which
    # matches the insertion-order scan the previous implementation did.
    samples_by_subkey = {}
    for key, score in lexstat_samples.items():
        samples_by_subkey.setdefault(key[:3], []).append(score)

    with open(vectors_path, newline='', encoding='utf-8') as f:
        vectors = [row for row in csv.DictReader(f)]

    for vector in vectors:
        assert vector['l1'] < vector['l2']
        subkey = (gloss_d[vector['gloss']], vector['l1'], vector['l2'])
        # raises IndexError if there is no sample left for this subkey,
        # mirroring the old pots[0] behavior
        scores = samples_by_subkey[subkey].pop(0)
        vector['lexstat_simAA'] = scores[0]
        vector['lexstat_simBB'] = scores[1]
        vector['lexstat_simAB'] = scores[2]

    # every calculated sample must have been matched to exactly one vector
    assert all(len(rest) == 0 for rest in samples_by_subkey.values())

    frame = pd.DataFrame(vectors, columns=VECTORS_COLS)
    frame.to_csv(vectors_path, index=False, float_format='%.10f')
def test_is_asjp_data(self):
    """is_asjp_data should be False for IPA data and True for ASJP data."""
    data_ipa = load_data(FIXTURE_DATASET)
    data_asjp = load_data(FIXTURE_DATASET_ASJP)

    self.assertFalse(is_asjp_data(data_ipa))
    self.assertTrue(is_asjp_data(data_asjp))
def setUp(self):
    """Load both the IPA and the ASJP fixture datasets."""
    self.data_asjp = load_data(FIXTURE_DATASET_ASJP)
    self.data = load_data(FIXTURE_DATASET)
def setUp(self):
    """Load the fixture dataset and the default params."""
    self.data = load_data(FIXTURE_DATASET)
    self.params = load_params(PARAMS_DIR)