Exemple #1
0
def infer_lexstat(datasets_dir, output_dir):
    """
    Locates all datasets under datasets_dir and runs _infer_lexstat on each
    of them, selecting the appropriate lingpy transcription schema per
    dataset. Results are written to output_dir as <dataset>.lsCC.csv.
    """
    for path in find_all_datasets(datasets_dir):
        stem = os.path.basename(path).split('.')[0]
        target = os.path.join(output_dir, '{}.lsCC.csv'.format(stem))

        # ASJP-encoded datasets need the asjp schema; everything else is ipa.
        if is_asjp_data(load_data(path)):
            schema = 'asjp'
        else:
            schema = 'ipa'

        with set_schema(schema):
            _infer_lexstat(path, target)
Exemple #2
0
def patch_targets(dataset_path):
    """
    Re-outputs the dataset's targets.

    Collects the synonym-pair sample keys across every ordered language pair
    of the dataset and delegates to load_targets.
    """
    data = load_data(dataset_path)

    keys = []
    for lang1 in data.keys():
        for lang2 in data.keys():
            if lang1 < lang2:
                syn, _ = get_pairs(lang1, lang2, data)
                keys.extend(syn.keys())

    return load_targets(dataset_path, keys, data.keys())
Exemple #3
0
def check(dataset_path, params_dir):
    """
    Performs a dry run of the prepare command. Returns a helpful string
    reporting the results of the performed checks.

    The last check takes too long to be practical, so it is disabled.
    """
    # NOTE(review): the docstring says the last check is disabled, but
    # check_lexstat is invoked below — confirm which statement is current.
    params = load_params(params_dir)
    data = load_data(dataset_path)

    results = (
        check_asjp_conversion(data, params),
        check_pmi(data, params),
        check_load_targets(data, dataset_path),
        check_lexstat(data, dataset_path),
    )

    return '\n\n'.join(results)
Exemple #4
0
def patch_lexstat(dataset_path, vectors_path):
    """
    Re-calculates the dataset's LexStat scores and patches them into the
    vectors CSV in place.

    LexStat similarity scores are computed for every ordered language pair
    under the dataset's transcription schema; each row of the vectors file is
    then assigned the simAA/simBB/simAB scores whose sample key starts with
    the row's (global gloss id, l1, l2) triple, and the file is rewritten.

    Raises AssertionError if a vector row has l1 >= l2 or if any computed
    sample is left unmatched, and IndexError if a row has no matching sample.
    """
    data = load_data(dataset_path)

    all_langs = list(data.keys())
    lang_pairs = [(a, b) for a in all_langs for b in all_langs if a < b]

    lexstat_samples = {}

    schema = 'asjp' if is_asjp_data(data) else 'ipa'
    with set_schema(schema):
        lingpy_wordlist = make_wordlist(data, dataset_path, schema)

        for lang1, lang2 in lang_pairs:
            scores = calc_lexstat(lang1, lang2, lingpy_wordlist)
            for key, score in scores.items():
                key = explode_sample_id(key, all_langs)
                lexstat_samples[key] = list(score)

    gloss_d = {}  # gloss to global gloss id
    with open(dataset_path) as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            gloss_d[row['gloss']] = row['global_id']

    vectors = []
    with open(vectors_path, newline='', encoding='utf-8') as f:
        vectors = [row for row in csv.DictReader(f)]

    # Index the sample keys by their (global gloss id, l1, l2) prefix so each
    # vector row is matched with a single lookup instead of re-scanning every
    # key (the original comprehension made this loop quadratic). Insertion
    # order within each bucket is preserved, so multi-match ties resolve
    # exactly as before; a missing match still raises IndexError.
    by_subkey = {}
    for key in lexstat_samples:
        by_subkey.setdefault(key[:3], []).append(key)

    for vector in vectors:
        assert vector['l1'] < vector['l2']
        subkey = (gloss_d[vector['gloss']], vector['l1'], vector['l2'])
        key = by_subkey.get(subkey, []).pop(0)
        scores = lexstat_samples.pop(key)
        vector['lexstat_simAA'] = scores[0]
        vector['lexstat_simBB'] = scores[1]
        vector['lexstat_simAB'] = scores[2]

    # Every computed sample must have been consumed by exactly one vector row.
    assert len(lexstat_samples) == 0

    frame = pd.DataFrame(vectors, columns=VECTORS_COLS)
    frame.to_csv(vectors_path, index=False, float_format='%.10f')
Exemple #5
0
	def test_is_asjp_data(self):
		"""The plain fixture must not be detected as ASJP; the ASJP one must."""
		plain = load_data(FIXTURE_DATASET)
		asjp = load_data(FIXTURE_DATASET_ASJP)

		self.assertFalse(is_asjp_data(plain))
		self.assertTrue(is_asjp_data(asjp))
Exemple #6
0
 def setUp(self):
     """Load both fixture datasets for use by the test methods."""
     self.data_asjp = load_data(FIXTURE_DATASET_ASJP)
     self.data = load_data(FIXTURE_DATASET)
Exemple #7
0
 def setUp(self):
     """Load the fixture dataset and the params used by the test methods."""
     self.data = load_data(FIXTURE_DATASET)
     self.params = load_params(PARAMS_DIR)