def generate_dataset():
    """Write the works-distance CSV datasets derived from WORKS_DATASET_FILE.

    The first row of WORKS_DATASET_FILE is treated as the header
    (``document_fields``) and copied unchanged to both output files. The
    remaining rows are consumed in consecutive pairs (1st with 2nd, 3rd with
    4th, ...). For each pair the result of
    ``WDDistance.get_scaled_distance_array`` is written:

    * to WORKS_DISTANCE_DATASET_FILE as a single row;
    * to WORKS_DISTANCE_DATASET_CMP_FILE preceded by the two source rows.

    A trailing unpaired row (odd number of data rows) produces no output.
    A progress line is printed every 500 data rows.

    ``max_rows`` and ``match_strategy`` are not defined in this block; they
    are read from the surrounding scope. ``max_rows == -1`` means "no limit";
    otherwise reading stops once that many data rows have been consumed (at
    least one data row is always consumed when available).

    An empty input file leaves both outputs empty; a header-only input
    produces outputs containing just the header.
    """
    # newline='' is what the csv module expects for files handed to
    # csv.reader/csv.writer; it keeps the explicit '\n' lineterminator from
    # being translated and preserves newlines embedded in quoted fields.
    # The output files are opened (and truncated) before the input file,
    # in the same order as before.
    with open(WORKS_DISTANCE_DATASET_CMP_FILE, 'w', encoding='utf8', newline='') as augmented_file, \
            open(WORKS_DISTANCE_DATASET_FILE, 'w', encoding='utf8', newline='') as distance_file, \
            open(WORKS_DATASET_FILE, 'r', encoding='utf-8', newline='') as works_file:
        distance_writer = csv.writer(distance_file, lineterminator='\n')
        augmented_writer = csv.writer(augmented_file, lineterminator='\n')
        works_reader = csv.reader(works_file)

        # Read the header up front with a default so an empty file does not
        # raise StopIteration.
        document_fields = next(works_reader, None)
        if document_fields is None:
            return
        distance_writer.writerow(document_fields)
        augmented_writer.writerow(document_fields)

        first_of_pair = None
        # `consumed` is the number of data rows handled before this one.
        for consumed, row in enumerate(works_reader):
            if consumed % 500 == 0:
                print('Processed %s rows' % consumed)

            if consumed % 2 == 0:
                # First member of a pair: remember it until its partner arrives.
                first_of_pair = row
            else:
                # The first row is truncated to the length of the second and
                # both are passed as copies, so the rows written below are
                # unaffected by anything the distance routine does to its
                # arguments.
                distance_array = WDDistance.get_scaled_distance_array(
                    first_of_pair[:len(row)], row[:], document_fields, match_strategy)
                augmented_writer.writerow(first_of_pair)
                augmented_writer.writerow(row)
                augmented_writer.writerow(distance_array)
                distance_writer.writerow(distance_array)

            if max_rows != -1 and consumed + 1 >= max_rows:
                break
 def test_get_fuzzy_distance(self):
     """Print WDDistance.get_fuzzy_distance for a few sample string pairs.

     This makes no assertions; the results are only printed for inspection.
     """
     sample_pairs = (
         # Disabled samples, kept for reference:
         # ('test', 'tset'),
         # ('test and nottest', 'test & nottest'),
         ('Freund', 'FREUND PUBLISHING HOUSE, LTD.'),
         ('nature publishing group', 'elsevier science'),
     )
     for left, right in sample_pairs:
         distance = WDDistance.get_fuzzy_distance(left, right)
         print(distance)