Example #1
def bestMatch(token):
    # Convert the token with soundex
    soundexToken = soundex(token)

    # Find entries in soundexDict with the same soundex code and save their index numbers
    candidateIndex = []
    for i in range(len(soundexDict)):
        if soundexToken == soundexDict[i]:
            candidateIndex.append(i)

    # Using the index numbers of the matches, extract the original words from the dictionary
    candidateLs = []
    for i in candidateIndex:
        candidateLs.append(dictLs[i])

    # Use Levenshtein Distance (edit distance) to compare every potential match to the misspelled token,
    # and return the most similar one as the best match
    maxRatio = 0
    bestMatch = ""
    for i in candidateLs:
        # The higher the ratio, the more similar the two strings are
        ratio = lev.ratio(token, i)
        if ratio > maxRatio:
            maxRatio = ratio
            bestMatch = i
    return bestMatch
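A minimal usage sketch for bestMatch, runnable when pasted below the function above. The simplified soundex() here is only a stand-in for whatever implementation the original imports (Example #7 shows "import Levenshtein as lev" and "from soundex import *"); dictLs and the sample words are made-up.

import Levenshtein as lev

# letter-to-digit table for a simplified Soundex (h/w handling omitted)
codes = {**dict.fromkeys("bfpv", "1"), **dict.fromkeys("cgjkqsxz", "2"),
         **dict.fromkeys("dt", "3"), "l": "4", **dict.fromkeys("mn", "5"), "r": "6"}

def soundex(word):
    # keep the first letter, encode the rest, drop repeated codes, pad/truncate to 4 characters
    word = word.lower()
    encoded = word[0].upper()
    prev = codes.get(word[0], "")
    for ch in word[1:]:
        code = codes.get(ch, "")
        if code and code != prev:
            encoded += code
        prev = code
    return (encoded + "000")[:4]

dictLs = ["robert", "rupert", "ashcraft", "tymczak"]      # hypothetical dictionary
soundexDict = [soundex(w) for w in dictLs]

print(bestMatch("robrt"))   # expected to return "robert" rather than "rupert"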
Example #2
def phonetic_candidates_soundex(word, d):
    word = word.lower()
    phonetic_representation = soundex(word)[1]
    # print(phonetic_representation)
    soundex_candidates = []
    if phonetic_representation in dict_inverted_soundex:
        word_list = dict_inverted_soundex[phonetic_representation]
        for w in word_list:
            soundex_candidates.append((w, word_threshold * d))

    return soundex_candidates
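The function above assumes a module-level dict_inverted_soundex that maps a phonetic code to every vocabulary word sharing it, plus a word_threshold weight. A hedged sketch of how such an inverted index might be built; build_inverted_soundex and phonetic_code are hypothetical names, not part of the example.

from collections import defaultdict

def build_inverted_soundex(vocabulary, phonetic_code):
    # group vocabulary words by their phonetic code
    index = defaultdict(list)
    for w in vocabulary:
        index[phonetic_code(w)].append(w)
    return dict(index)

# e.g. dict_inverted_soundex = build_inverted_soundex(vocabulary, lambda w: soundex(w)[1]),
# matching the soundex(word)[1] access used above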
Example #4
    def __init__(self, corpus_name):
        logger.info(f"initialize index for file {corpus_name}")
        self.file_name = corpus_name
        self.soundex = soundex(N__GRAM)
        # map algorithm names to the distance/similarity functions used for matching
        self.algo_ref = {
            'levenshtein': levenshtein,
            'c_levenshtein': editdistance.eval,
            'lcs': lcs,
            'ngrams': ngrams_match
        }
        self.load_corpus()
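The algo_ref dict above is a simple name-to-function dispatch table. A standalone sketch of the same pattern; the edit_distance() below is a plain dynamic-programming Levenshtein written only so the sketch runs on its own, not the levenshtein/lcs/ngrams_match functions the example wires in.

def edit_distance(a, b):
    # classic O(len(a) * len(b)) Levenshtein distance
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

algo_ref = {'levenshtein': edit_distance}
print(algo_ref['levenshtein']('kitten', 'sitting'))     # 3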
Example #5
def main(argv):

    parser = argparse.ArgumentParser(
        description=
        'Merge split WMO Publication 47 metadata files to one file per country'
    )
    parser.add_argument( "-config", dest = "config", required = False, \
                          default = "config.json", help = "JSON file containing configuration settings")
    parser.add_argument( "-jobs", dest = "jobs", required = True, \
                          default = "jobs.json", help = "JSON file containing configuration and list of jobs to run")
    parser.add_argument("-countries", dest="country_file", required=False, \
                         help="JSON file containing list of countries to process", default = None)
    parser.add_argument("-index", dest="index", required=False,  type = int,  \
                         help="Index of country to process", default = None)
    parser.add_argument("-country", dest="country", required=False, \
                         help="2 character country code to process", default = None)

    #parser.add_argument( "-log", dest="log_path", required=False, default='./', \
    #                     help = "Directory to write log files to")
    #parser.add_argument( "-tag", dest="tag", required=False, default='', \
    #                     help = "Tag appended to log files")

    # add argument to specify index / position in list of countries.
    args = parser.parse_args()
    control_file = args.jobs
    config_file = args.config
    country_file = args.country_file
    country = args.country
    country_index = args.index - 1 if args.index is not None else None
    #log_path = args.log_path

    if country_file is None and country is None:
        print("Error: one of -countries or -country must be supplied")
        assert False

    if country_file is not None and country is not None:
        print("Error: only one of -countries and -country may be supplied")
        assert False

    # load config options
    with open(config_file) as cf:
        config = json.load(cf)

    with open(control_file) as s:
        control = json.load(s)

    #datapath         = config['data_path']
    configpath = config['config_path']
    #verbose          = config['verbose']
    outputpath = config['output_path']
    #corrections_file = configpath + './' + config['corrections_file']

    map_file = config["mapping_path"] + "./pub47_common_names.json"

    fmiss = -999999.
    imiss = -1  # -999999

    with open(map_file) as m:
        mapping = json.load(m)

    if country_file is not None:
        with open(country_file) as m:
            countries = json.load(m)
        if country_index is not None:
            country = countries[country_index]
            countries = list()
            countries.append(country)
    else:
        countries = list()
        countries.append(country)

    # iterate over countries
    for country in countries:
        print("Processing " + country)
        master = pd.DataFrame()
        for job in control['jobs']:
            schema = pub47schema(configpath + './schemas/', job['schema'])
            input_file = outputpath + './split/' + os.path.basename(
                job['data_file']) + "." + country

            # files now only exist for country if data in them, warn if no file found.
            if not os.path.isfile(input_file):
                print('{} not found'.format(input_file))
                continue
            else:
                print(' ... {} '.format(input_file))
            # load data
            datain = pd.read_csv(input_file, sep='|',
                                 dtype='object')  # read as object
            datain = datain.drop_duplicates(
                keep='first')  # some duplicates are appearing from somewhere !
            # check whether we need to handle columns that have been split
            # split_columns = (len( schema.split_fields ) > 0)

            # NOTE: only text / object columns split so don't need to convert those columns
            # need to check / revise this in the future
            # convert to expected data type
            numeric_columns = list()
            columns_processed = list()
            for column in schema.column_name:
                columns_processed.append(column)
                if schema.column_type[column] == 'int':
                    datain[column].replace(cmiss, str(imiss), inplace=True)
                    datain = datain.astype({column: 'int'})
                elif schema.column_type[column] == 'float':
                    datain[column].replace(cmiss, str(fmiss), inplace=True)
                    datain = datain.astype({column: 'float'})
                    numeric_columns.append(column)

            # convert numeric_columns variable to set for later use
            numeric_columns = set(numeric_columns)

            # fill all NAs with fmiss (-99999)
            #datain.fillna( fmiss , inplace = True)

            # convert valid_from and valid_to to datetime objects, these are not in the schema but added in the first
            # step of processing
            datain['valid_from'] = pd.to_datetime(datain['valid_from'])
            datain['valid_to'] = pd.to_datetime(datain['valid_to'])

            # identify which mapping to use
            version = "v" + str(schema.version)
            mapUse = mapping[version][1]
            invMap = dict([[v, k] for k, v in mapUse.items()])

            # map columns in input data to output required (store in tmp df)
            tmpDf = datain.copy()
            tmpDf = tmpDf.assign(source_file=input_file)
            tmpDf = tmpDf.assign(alt_names='')

            # check if year present, if not set to year from schema
            if not ('year' in tmpDf):
                tmpDf = tmpDf.assign(year=job['year'])

            tmpDf = tmpDf.assign(month=job['month'])
            tmpDf = tmpDf.assign(publication_frequency=job['freq'])

            # rename columns to that expected in output schema
            colNewNames = dict()
            for column in tmpDf:
                if column in invMap:
                    colNewNames[column] = invMap[column]

            tmpDf.rename(columns=colNewNames, inplace=True)

            # regularise ship names, first need to fill null strings with 'NULL'
            tmpDf['name'] = tmpDf['name'].fillna('NULL')
            # collapse runs of whitespace to a single space
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("\\s+", " ", x))
            # now add a dot after single initials, e.g. ' A ' becomes ' A. '
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("( )([A-Z])( )", "\\1\\2.\\3", x))
            # finally, add a space between a dot and the following letter, e.g. 'A.ABC' becomes 'A. ABC'
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("([A-Z])(\\.)([A-Z])", "\\1\\2 \\3", x))

            sx = tmpDf['name'].apply(lambda x: soundex(x))
            tmpDf = tmpDf.assign(sx=sx)
            tmpDf = tmpDf.assign(record_number=1)
            to_add = []
            # now check each callsign and ship name to see if records can be merged with existing
            if master.shape[0] > 0:
                print("   input_file: {}  ".format(input_file))
                print(tmpDf['callsign'].dtype)
                for idx in tmpDf.index.values:
                    action = 'new_record'
                    id = tmpDf.loc[idx, 'callsign']
                    if id == cmiss:
                        continue
                    shipname = tmpDf.loc[idx, 'name']
                    matches = master.loc[(master['callsign'] == id)].copy()
                    if matches.shape[0] > 0:

                        # get last record added
                        max_record = max(matches['record_number'])
                        id_match = matches[matches['record_number'] ==
                                           max_record].index.values

                        # now get similarity in names, either by soundex or type
                        distance = max( float(matches['sx'][id_match[0]] == tmpDf.loc[idx, 'sx' ]),\
                                        Levenshtein.ratio(matches['name'][id_match[0]], shipname) )

                        # if close match check elements
                        if distance > 0.8:
                            # get list of common fields between new entry and matches
                            common = list(
                                set(list(tmpDf)).intersection(
                                    list(matches)).intersection(
                                        config['duplicateChecks']))

                            # perform merge
                            # idx = row in current file
                            # id_match = row in master data frame

                            # if rows are the same excluding missing data, merge copying missing data
                            # else if rows are different add new row.

                            # get list of matching elements (TRUE|FALSE)
                            matching_elements = tmpDf.loc[
                                idx, common] == matches.loc[id_match[0],
                                                            common]
                            # possible actions
                            #  - merge and fill
                            #  - merge and correct
                            #  - keep old, increment dates
                            #  - add new
                            if matching_elements.all():  # exact match, merge dates and files
                                action = 'increment_date'
                                min_date = min({
                                    tmpDf.loc[idx, 'valid_from'],
                                    matches.loc[id_match[0], 'valid_from']
                                })
                                max_date = max({
                                    tmpDf.loc[idx, 'valid_to'],
                                    matches.loc[id_match[0], 'valid_to']
                                })
                                #master.at[matches.index[0], 'valid_to'] = max_date
                                #master.at[matches.index[0], 'valid_from'] = min_date
                                master.at[id_match[0], 'valid_to'] = max_date
                                master.at[id_match[0], 'valid_from'] = min_date
                                master.at[id_match[0], 'source_file'] = \
                                    master.loc[id_match[0], 'source_file'] + ';' + tmpDf.loc[idx, 'source_file']
                                if (tmpDf.loc[idx, 'name'] != master.loc[id_match[0], 'name']) & \
                                        (tmpDf.loc[idx, 'name'] not in master.loc[id_match[0], 'alt_names']):
                                    master.at[id_match[0], 'alt_names'] = \
                                        master.loc[id_match[0], 'alt_names'] + ';' + tmpDf.loc[idx, 'name']
                            else:
                                # remove missing elements and recheck
                                missing_left = (
                                    (tmpDf.loc[idx, common] == cmiss) |
                                    (tmpDf.loc[idx, common] == imiss) |
                                    (tmpDf.loc[idx, common] == fmiss))
                                missing_right = (
                                    (matches.loc[id_match[0], common] == cmiss)
                                    |
                                    (matches.loc[id_match[0], common] == imiss)
                                    | (matches.loc[id_match[0], common]
                                       == fmiss))
                                missing = (missing_left | missing_right)
                                missing = (missing | matching_elements)
                                if missing.all():
                                    action = 'fill_missing'
                                    mismatch = ~matching_elements
                                    right_columns = missing_right.index[(
                                        missing_right & mismatch)].format()
                                    # set valid date range to span both records
                                    min_date = min({
                                        tmpDf.loc[idx, 'valid_from'],
                                        matches.loc[id_match[0], 'valid_from']
                                    })
                                    max_date = max({
                                        tmpDf.loc[idx, 'valid_to'],
                                        matches.loc[id_match[0], 'valid_to']
                                    })
                                    master.at[id_match[0], 'source_file'] = \
                                        master.loc[id_match[0], 'source_file'] + ';' + tmpDf.loc[idx, 'source_file']
                                    if (tmpDf.loc[idx, 'name'] != master.loc[id_match[0], 'name']) & \
                                            (tmpDf.loc[idx, 'name'] not in master.loc[id_match[0], 'alt_names']):
                                        master.at[id_match[0], 'alt_names'] = \
                                            master.loc[id_match[0], 'alt_names'] + ';' + tmpDf.loc[idx, 'name']
                                    # now update master table
                                    master.at[id_match[0], 'valid_to'] = max_date
                                    master.at[id_match[0], 'valid_from'] = min_date
                                    # now fill master table (this is the one we keep)
                                    if len(right_columns) > 0:
                                        master.at[id_match[0], right_columns] = tmpDf.loc[idx, right_columns]
                                else:
                                    # now check numeric (float) elements
                                    mismatch = ~(matching_elements | missing)
                                    numeric_mismatch = numeric_columns.intersection(
                                        mismatch.index[mismatch].format())
                                    if len(numeric_mismatch) > 0:
                                        print(" **** Numeric mismatch **** ")
                                        print(tmpDf.loc[idx, pd.np.array(common)[mismatch]])
                                        print(matches.loc[id_match[0], pd.np.array(common)[mismatch]])
                                        action = 'correct_numeric'
                                    else:
                                        action = 'new_record'
                                        tmpDf.at[idx, 'record_number'] = max_record + 1
                        else:
                            action = 'new_record'
                            tmpDf.at[idx, 'record_number'] = max_record + 1
                    if action == 'new_record':
                        to_add.append(idx)
            else:
                to_add = tmpDf.index.values
            # concat to master table
            master = pd.concat([master, tmpDf.loc[to_add, ]],
                               ignore_index=True,
                               sort=False)
            # replace nans with expected missing value
            for column in master:
                if master[column].dtype == 'datetime64[ns]':
                    continue
                if master[column].dtype == 'float64':
                    master[column].fillna(fmiss, inplace=True)
                elif master[column].dtype == 'object':
                    master[column].fillna(cmiss, inplace=True)
                elif master[column].dtype == 'int64':
                    master[column].fillna(imiss, inplace=True)
                else:
                    print('Unknown column type: {}'.format(
                        master[column].dtype))

        # final step is sort and addition of record numbers

        # assign UIDs to all records
        uid = master.apply(
            lambda x: '{}-{}-{}'.format(x['callsign'], x['sx'], x['recruiting_country']),
            axis=1)
        master = master.assign(uid=uid)

        # sort by id then date
        master.sort_values(['uid', 'valid_from'], inplace=True)

        # reset index
        master.reset_index(inplace=True, drop=True)

        # now reset record numbers based on uid
        uids = master['uid'].unique()
        count = 0
        for uid in uids:
            if count % 200 == 0:
                print('{} / {} '.format(count, len(uids)))
            records = master.loc[master['uid'] == uid, :]
            nrecs = records.shape[0]
            master.at[records.index, 'record_number'] = pd.np.arange(nrecs)
            # adjust valid from and to for 1st and last records
            new_valid_to = records.valid_from.shift(-1)
            to_change = (((records['valid_to'] - new_valid_to)).dt.days >= -3625) & \
                        (((records['valid_to'] - new_valid_to)).dt.days  <= 0)
            if to_change.any():
                records.loc[to_change, 'valid_to'] = new_valid_to[to_change]
            # add 5 years to last record and subtract 1 year from first
            records.loc[records.index[0], 'valid_from'] = records.loc[
                records.index[0], 'valid_from'] - relativedelta(months=12)
            records.loc[records.index[nrecs - 1],
                        'valid_to'] = records.loc[records.index[nrecs - 1],
                                                  'valid_to'] + relativedelta(
                                                      months=60)
            master.at[records.index, ['valid_from', 'valid_to']] = records.loc[
                records.index, ['valid_from', 'valid_to']]
            count += 1

        # now save
        # convert each field back to str and replace missing values with NULL
        master = master.astype(str)
        master.replace(str(fmiss), pd.np.nan, inplace=True)
        master.replace(str(imiss), pd.np.nan, inplace=True)
        master.to_csv(outputpath + './master/master.' + country + '.csv',
                      index=False,
                      sep='|',
                      na_rep='NULL')
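The merge step above decides whether two records describe the same ship by taking the larger of a soundex-code comparison and Levenshtein.ratio on the names, with a 0.8 cut-off. A condensed, standalone sketch of that test; the threshold default mirrors the script, while same_ship and the sample names/codes are made-up.

import Levenshtein

def same_ship(name_a, sx_a, name_b, sx_b, threshold=0.8):
    # 1.0 if the phonetic codes agree, otherwise fall back to string similarity
    similarity = max(float(sx_a == sx_b), Levenshtein.ratio(name_a, name_b))
    return similarity > threshold

print(same_ship("OCEAN STAR", "O225", "OCEAN STAR I", "O225"))    # True (codes agree)
print(same_ship("OCEAN STAR", "O225", "PACIFIC DAWN", "P212"))    # False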
Example #6
        rdr.next()

        for row in rdr:
            female_names.append(row[1])
            male_names.append(row[0])

parse_csv('/Users/TJiang/Desktop/name_in_english/names.csv')
print male_names
exit(0)

#soundex = fuzzy.Soundex(4)

hash_list = []

for n in male_names:
    print '%-10s' % n, soundex(n)
    hash_list.append(soundex(n))

user_name   = raw_input('enter desired name to match')
user_hash   = soundex(user_name)
gender      = 'm'
#gender      = raw_input('enter gender m/f')

same_score_list = []

top_match = -1
for idx, one_hash in enumerate(hash_list):
    score = fuzz.ratio(user_hash, one_hash)
    if score > top_match:
        top_match = score
        del same_score_list[:]
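The (truncated, Python 2) snippet above keeps the names whose soundex hash scores highest against the user's hash via fuzz.ratio, clearing same_score_list whenever a better score appears. A compact Python 3 sketch of that top-score collection; best_hash_matches and the sample codes are hypothetical.

from fuzzywuzzy import fuzz

def best_hash_matches(user_hash, hash_list):
    # return the best fuzz.ratio score and the indices that achieve it
    top_score, best = -1, []
    for idx, one_hash in enumerate(hash_list):
        score = fuzz.ratio(user_hash, one_hash)
        if score > top_score:
            top_score, best = score, [idx]
        elif score == top_score:
            best.append(idx)
    return top_score, best

print(best_hash_matches("J500", ["J500", "J520", "A450"]))   # (100, [0])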
Example #7
# Find the best match from the dictionary for a misspelled token

import Levenshtein as lev
from soundex import *

# Convert every word in the dictionary with soundex; the result is referred to as soundexDict
soundexDict = []
for i in dictLs:
    soundexDict.append(soundex(i))


def bestMatch(token):
    # Convert the token with soundex
    soundexToken = soundex(token)

    # Find entries in soundexDict with the same soundex code and save their index numbers
    candidateIndex = []
    for i in range(len(soundexDict)):
        if soundexToken == soundexDict[i]:
            candidateIndex.append(i)

    # Using the index numbers of the matches, extract the original words from the dictionary
    candidateLs = []
    for i in candidateIndex:
        candidateLs.append(dictLs[i])

    # Use Levenshtein Distance (edit distance) to compare every potential match to the misspelled token,
    # and return the most similar one as the best match
    maxRatio = 0
    bestMatch = ""
    for i in candidateLs:
        # The higher the ratio, the more similar the two strings are
        ratio = lev.ratio(token, i)
        if ratio > maxRatio:
            maxRatio = ratio
            bestMatch = i
    return bestMatch
Example #8
def main(argv):

    parser = argparse.ArgumentParser(
        description='Split WMO Publication 47 metadata files by country')
    parser.add_argument( "-config", dest = "config", required = False, \
                          default = "config.json", help = "JSON file containing configuration settings")
    parser.add_argument( "-jobs", dest = "jobs", required = True, \
                          default = "jobs.json", help = "JSON file containing list of jobs to run")
    parser.add_argument( "-start", dest="jobIndexStart", type=int, required=True, default=1, \
                         help = "Index of first job to process")
    parser.add_argument( "-end", dest="jobIndexEnd", type=int, required=False, default=None, \
                         help = "Index of last job to process, defaults to first job")
    parser.add_argument( "-log", dest="log_path", required=False, default='./', \
                         help = "Directory to write log files to")
    parser.add_argument( "-tag", dest="tag", required=False, default='', \
                         help = "Tag appended to log files")

    args = parser.parse_args()
    control_file = args.jobs
    first_job = args.jobIndexStart
    last_job = args.jobIndexEnd
    config_file = args.config
    log_path = args.log_path

    if last_job is None:
        last_job = first_job

    # set validity periods for different editions
    validity = {'annual': 12, 'quarterly': 3, 'semi-annual': 6}

    # global options stored in config file
    # jobs specific options in control_file (need to rename)

    # load config options
    with open(config_file) as cf:
        config = json.load(cf)

    # load controls / list of files to process
    with open(control_file) as s:
        control = json.load(s)

    # parsing using pandas

    # global options

    map_path = config['mapping_path']
    datapath = config['data_path']
    configpath = config['config_path']
    verbose = config['verbose']
    outputpath = config['output_path']
    corrections_file = configpath + './' + config['corrections_file']

    print(corrections_file)

    # read options from control file
    log_file = log_path + './split_pub47_' + args.tag + '.log'

    # load corrections
    with open(corrections_file) as m:
        corrections = json.load(m)

    # open log file for later use
    log = open(log_file, 'w')

    # iterate over jobs in control file
    for job_index in pd.np.arange(first_job, last_job + 1, 1):
        # find job in job list
        for job in control['jobs']:
            if job['jobindex'] == job_index:
                break
        assert job_index == job['jobindex']

        rejects = pd.DataFrame()

        # load schema
        schema = pub47schema(configpath + './schemas/', job['schema'])

        # get input file
        input_file = job['data_file']
        input_file = datapath + input_file

        # set validity dates
        valid_from = datetime.date(job['year'], job['month'], 1)
        valid_to = datetime.date(job['year'], job['month'], 1) + relativedelta(
            months=validity[job['freq']])

        # feedback
        if verbose > 0:
            print("Processing " + os.path.basename(input_file), file=log)

        # now read in the data
        datain = pub47load(schema, input_file, map_path)

        # remove any exact duplicates
        datain = datain.drop_duplicates(keep='first')

        # now we need to identify duplicates within country
        id_counts = datain.loc[:, 'call'].value_counts()
        duplicated_ids = id_counts.index.values[id_counts > 1]
        duplicated_ids = list(duplicated_ids[duplicated_ids != cmiss])
        unique_ids = list(id_counts.index.values[id_counts == 1])
        unique_ids.append(cmiss)
        unique_rows = datain.loc[
            datain['call'].apply(lambda x: x in unique_ids), :].copy()

        for dup_id in duplicated_ids:
            dup_rows = datain.loc[datain['call'] == dup_id, :]
            # more than two entries for same callsign, reject all for later assessment
            if dup_rows.shape[0] > 2:
                rejects = pd.concat([rejects, dup_rows],
                                    ignore_index=True,
                                    sort=False)
                continue
            cmp = dup_rows.apply(lambda x: pub47_record_completeness(x),
                                 axis=1)
            vsslM = dup_rows.loc[:, 'vsslM']
            most_complete = list(cmp[cmp == max(cmp)].index.values)
            highest_class = list(vsslM[vsslM == min(vsslM)].index.values)
            ix = dup_rows.index.values
            same_name = soundex(dup_rows.loc[ix[0], 'name']) == soundex(
                dup_rows.loc[ix[1], 'name'])
            same_country = dup_rows.loc[ix[0], schema.recruiting_country ] == \
                           dup_rows.loc[ix[1], schema.recruiting_country ]
            # if same country and name merge if possible
            # if different country but same name use highest VOS class
            # else mark for rejection as ambiguous
            if same_country and same_name:
                # check if we can merge
                if pub47_record_compare(
                        dup_rows.loc[ix[0], schema.duplicate_check],
                        dup_rows.loc[ix[1], schema.duplicate_check]):
                    record_to_add = dup_rows.loc[[most_complete[0]], ].copy()
                    merged_record = pub47_merge_rows(
                        dup_rows.loc[ix[0], schema.duplicate_check],
                        dup_rows.loc[ix[1], schema.duplicate_check])
                    # record_to_add.at[ ix[0], schema['duplicate_check'] ] \
                    merged_record = pd.DataFrame(merged_record).transpose()
                    record_to_add.reset_index(inplace=True, drop=True)
                    merged_record.reset_index(inplace=True, drop=True)
                    record_to_add.at[:, schema.duplicate_check] = \
                        merged_record.loc[:, schema.duplicate_check]
                elif len(highest_class) == 1:
                    record_to_add = dup_rows.loc[[highest_class[0]], :].copy()
                elif len(most_complete) == 1:
                    record_to_add = dup_rows.loc[[most_complete[0]], :].copy()
                else:
                    rejects = pd.concat([rejects, dup_rows],
                                        ignore_index=True,
                                        sort=False)
                    record_to_add = None
            elif same_country:
                rejects = pd.concat([rejects, dup_rows],
                                    ignore_index=True,
                                    sort=False)
                record_to_add = None
            else:
                record_to_add = dup_rows
            if record_to_add is not None:
                unique_rows = pd.concat([unique_rows, record_to_add],
                                        ignore_index=True,
                                        sort=False)

        # save rejects to file
        print("Saving rejects")
        rejects = rejects.astype(str)
        rejects.replace(str(fmiss), pd.np.nan, inplace=True)
        rejects.replace(str(imiss), pd.np.nan, inplace=True)
        rejects.to_csv(outputpath + './split/' + os.path.basename(input_file) +
                       '.' + 'reject',
                       index=False,
                       sep='|',
                       na_rep='NULL')
        datain = unique_rows.copy()

        # get list of countries present in file
        countries = datain.rcnty.unique()
        print(countries, file=log)

        # now loop over countries homogenising
        for country in countries:
            if verbose > 0:
                print("Processing {}".format(country), file=log)
            tmp_data = datain.loc[datain.rcnty == country].copy()

            tmp_data = tmp_data.reindex()
            nrows = tmp_data.shape[0]

            # output file for data from this country
            country_file = os.path.basename(input_file) + '.' + country

            cor = None
            # check if corrections exists for country / edition
            for cor_temp in corrections:
                if cor_temp['file'] == country_file:
                    cor = cor_temp
                    break

            # validate (and correct) data
            for column in tmp_data:
                # ++++++++++ CORRECT DATA ++++++++++
                # check if correction required and apply
                if cor is not None:
                    for f in cor['corrections']:
                        if f['field'] == column:
                            if f['all'] == 1:
                                if verbose > 0:
                                    print(
                                        "Applying corrections to all values in {}"
                                        .format(column),
                                        file=log)
                                    print("Factor = {}".format(f['factor']),
                                          file=log)
                                # getting non missing rows
                                # valid = tmp_data[column] != fmiss
                                valid = tmp_data[column].apply(
                                    lambda x: abs(x - fmiss) > tol)
                                # apply to tmp data
                                tmp_data.at[valid, column] = tmp_data.loc[
                                    valid, column] * f['factor']
                                # apply to datain
                                datain.at[ (datain['rcnty'] == country) & (valid) , column] = \
                                                datain.loc[ (datain['rcnty'] == country) & (valid) , column] * f['factor']
                            else:
                                valid = pv.validate_numeric(
                                    tmp_data[column],
                                    min_value=schema.column_valid_min[column],
                                    max_value=schema.column_valid_max[column],
                                    return_type='mask_series')
                                valid = valid & ~(tmp_data[column].apply(
                                    lambda x: abs(x - fmiss) < tol))
                                if any(valid):
                                    if verbose > 0:
                                        print(
                                            "Applying corrections to invalid values in {}"
                                            .format(column),
                                            file=log)
                                        print("Factor = {}".format(
                                            f['factor']),
                                              file=log)
                                    # apply to tmp data
                                    tmp_data.at[valid, column] = tmp_data.loc[
                                        valid, column] * f['factor']
                                    # now apply to datain
                                    valid = pv.validate_numeric(
                                        datain[column],
                                        min_value=schema.column_valid_min[column],
                                        max_value=schema.column_valid_max[column],
                                        return_type='mask_series')
                                    datain.at[ (datain['rcnty'] == country) & (valid), column] = \
                                                    datain.loc[ (datain['rcnty'] == country) & (valid), column] * f['factor']
                # ++++++++++ VALIDATE CODED DATA ++++++++++
                # get code table to validate against
                tableID = schema.column_code_table[column]
                if tableID in schema.code_tables:
                    codes = schema.code_tables[tableID]
                    if verbose > 1:
                        print("Validating against code table: " + str(tableID),
                              file=log)
                    whitelist = codes['code'].map(str)
                    whitelist = whitelist.append(
                        pd.Series([cmiss, '-1', 'NA', '-999999']))
                    tmp_values = tmp_data[column].map(str)
                    valid = pv.validate_string(tmp_values,
                                               whitelist=whitelist,
                                               return_type='mask_series')

                # ++++++++++ VALIDATE NUMERIC ++++++++++
                if tableID is None:
                    if schema.column_type[column] != 'object':
                        # if int, convert to numeric and replace imiss with fmiss
                        if str(tmp_data[column].dtype) == 'int64':
                            tmp_values = pd.to_numeric(tmp_data[column])
                            tmp_values = tmp_values.replace(
                                to_replace=imiss, value=fmiss)  # pd.np.nan )
                        else:
                            tmp_values = tmp_data[column]
                        valid = pv.validate_numeric(
                            tmp_data[column],
                            min_value=schema.column_valid_min[column],
                            max_value=schema.column_valid_max[column],
                            return_type='mask_series')
                        valid = valid & ~(tmp_data[column].apply(
                            lambda x: abs(x - fmiss) < tol))
                    else:
                        valid = pd.Series([False] * nrows)

                # calculate fraction bad
                fraction_bad = sum(valid) / nrows
                if (fraction_bad > 0.05) & (nrows > 10):
                    mask = valid.apply(lambda x: not x)
                    print("///////////// " + os.path.basename(input_file) +
                          '.' + country + " /////////////",
                          file=log)
                    print("Large number of bad values for " + column + "(" +
                          str(tableID) + ")",
                          file=log)
                    print(tmp_data.loc[valid, column].unique(), file=log)
                elif any(valid):
                    print("Bad values, {} ({})  :: {}".format(
                        column, str(tableID), tmp_values[valid].unique()),
                          file=log)

            dataout = datain[datain.rcnty == country]
            dataout = dataout.assign(valid_from=valid_from)
            dataout = dataout.assign(valid_to=valid_to)
            dataout = dataout.assign(schema=schema.version)
            # convert all columns to object and replace fmiss and imiss with NA
            dataout = dataout.astype(str)
            dataout.replace(str(fmiss), pd.np.nan, inplace=True)
            dataout.replace(str(imiss), pd.np.nan, inplace=True)
            dataout.to_csv(outputpath + './split/' +
                           os.path.basename(input_file) + '.' + country,
                           index=False,
                           sep='|',
                           na_rep='NULL')
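Coded columns above are checked against a whitelist built from the schema's code table plus missing-value markers. A hedged sketch of the same idea using plain pandas isin() in place of the pv.validate_string call; the codes, markers and sample column are made-up.

import pandas as pd

codes = pd.Series(["1", "2", "3"])                         # allowed codes for one table
whitelist = pd.concat([codes, pd.Series(["NA", "-1"])])    # plus missing-value markers
column = pd.Series(["1", "7", "NA", "3"])                  # sample coded column

invalid = ~column.isin(set(whitelist))
print(column[invalid])                  # values failing the code-table check -> "7"
print(invalid.sum() / len(column))      # fraction of bad values -> 0.25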