Example #1
    def test_la_data(self):
        ons = pd.read_csv("tests/data/las_ons.csv")
        os = pd.read_csv("tests/data/las_os.csv")

        start = timer()
        df_joined = fuzzy_left_join(ons, os, left_on = ["lad16nm"], right_on = ["name"])
        end = timer()
        time_taken =  end - start

        rename = {"lad16cd": "ons_code", "code": "os_code", "lad16nm": "ons_name", "name": "os_name"}
        df_joined = df_joined.rename(columns=rename)
        col_order = ["best_match_score", "ons_name", "os_name", "ons_code", "os_code"]

        num_records = len(df_joined)
        correct_binary = (df_joined["ons_code"] == df_joined["os_code"])
        perc_correct = correct_binary.sum()/num_records

        this_record = {}
        this_record["datetime"] = datetime.datetime.now().isoformat()
        this_record["commit_hash"] = get_commit_hash()
        this_record["perc_correct"] = perc_correct
        this_record["test_type"] = "local_authority"
        this_record["time_taken"] = time_taken

        with open("tests/realexample_performance.txt", "a") as myfile:
            myfile.writelines(json.dumps(this_record) + "\n")
Example #2
def fun_fuzzymatcher(left_data, right_data, left_on, right_on, dropyn):

    match = fuzzymatcher.fuzzy_left_join(left_data, right_data, left_on,
                                         right_on)
    if dropyn == "drop_yes":
        match.drop(["best_match_score", "__id_left", "__id_right"],
                   axis=1,
                   inplace=True)
    return match
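A minimal usage sketch, not taken from the original project: it applies the helper above to two small hypothetical DataFrames and shows that with dropyn="drop_yes" the fuzzymatcher bookkeeping columns (best_match_score, __id_left, __id_right) are removed from the result. The frame names, column names and values are assumptions made only for illustration.

import pandas as pd
import fuzzymatcher

# Hypothetical inputs; only the column names matter here.
customers = pd.DataFrame({"cust_name": ["Acme Ltd", "Globex Corp"]})
payments = pd.DataFrame({"payee": ["ACME LIMITED", "Globex Corporation"]})

# Keep the score/id columns...
with_scores = fun_fuzzymatcher(payments, customers, ["payee"], ["cust_name"], "drop_no")
# ...or drop them.
without_scores = fun_fuzzymatcher(payments, customers, ["payee"], ["cust_name"], "drop_yes")

print(with_scores.columns.tolist())
print(without_scores.columns.tolist())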
Example #3
def join_on_name_event():
    df1 = pd.read_csv(tapology_output)
    df2 = pd.read_csv(ufcstats_output)

    df1['Concat'] = df1['Event'] + " " + df1['Name']
    df2['Concat'] = df2['Event'] + " " + df2['Name']
    df = fuzzy_left_join(df1, df2, ['Concat'], ['Concat'])
    df.to_csv(final_output, index=False)
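Concatenating Event and Name into a single Concat column is one way to match on both fields at once. A sketch of an alternative (not from the original script, assuming the same tapology_output, ufcstats_output and final_output paths) is to pass both columns directly, since fuzzy_left_join accepts lists of match columns:

def join_on_event_and_name():
    # Hypothetical variant: let fuzzymatcher combine the two fields itself.
    df1 = pd.read_csv(tapology_output)
    df2 = pd.read_csv(ufcstats_output)
    df = fuzzy_left_join(df1, df2, ['Event', 'Name'], ['Event', 'Name'])
    df.to_csv(final_output, index=False)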
Example #4
def merging_fuz(left_col, right_col, df1, df2):
    res = fuzzy_left_join(df2.dropna(subset=right_col),
                          df1.dropna(subset=left_col),
                          left_on=right_col,
                          right_on=left_col)
    # res = res.drop_duplicates(subset=['prospect_id', 'user_dwh_id'], keep='first')
    # res = res[cols_keep]
    return res
Example #5
def match_by_county(df_addr, df_eircode, county):

    '''
    split left and right dfs by county for quicker, more accurate matches
    '''

    df_addr = df_addr[df_addr['County'] == county]
    df_eircode = df_eircode[df_eircode['county'] == county]
    
    matched_df = fuzzymatcher.fuzzy_left_join(df_addr, df_eircode, left_on = "Town", right_on = "name")

    return matched_df
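A small driver sketch, not part of the original source, showing how match_by_county might be applied county by county and the per-county results stacked back together. It assumes df_addr and df_eircode are already loaded with the 'County'/'county' columns the function expects.

import pandas as pd

counties = sorted(df_addr['County'].dropna().unique())
matched_parts = [match_by_county(df_addr, df_eircode, county) for county in counties]
df_matched = pd.concat(matched_parts, ignore_index=True)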
Example #6
def main():

    # START

    print(str(dt.datetime.now()) + '  STARTED')

    df_ppr = import_ppr(dir_input + 'PPR-ALL.csv')

    df_ppr.to_csv(dir_output + 'ppr-' +
                  format(dt.datetime.now().strftime("%Y%m%d-%H%M")) + '.csv',
                  index=False)

    print(str(dt.datetime.now()) + '  LOADED PPR DATA')

    # LOAD EIRCODE DATA

    print(str(dt.datetime.now()) + '  STARTED TO LOAD EIRCODE DATA')

    df_eirc = import_eircodes(dir_input + 'ie-towns.csv')
    df_eirc.to_csv(dir_output + 'eircode-' +
                   format(dt.datetime.now().strftime("%Y%m%d-%H%M")) + '.csv',
                   index=False)

    try:
        df1 = fuzzymatcher.fuzzy_left_join(df_ppr,
                                           df_eirc,
                                           left_on="lookup",
                                           right_on="lookup")
    except Exception as e:
        print(str(dt.datetime.now()) + '  IT STOPPED AGAIN')
        print(str(e))
        pass

    print(str(dt.datetime.now()) + '  EIRCODE LOOKUP COMPLETED')

    try:
        now = dt.datetime.now()
        df1.to_csv(dir_output + 'output_combined-' +
                   format(now.strftime("%Y%m%d-%H%M")) + '.csv',
                   index=False)
    except Exception as e:
        print('close the file')
        print(str(e))

    print(str(dt.datetime.now()) + '  FINISHED AND OUTPUTTED CSV FILE')
Example #7
    def transform(self):
        self.ts.notes = self.ts.notes.apply(
            lambda x: self.remove_punctuation(self.remove_digits(x.lower())))
        self.notes_id_dict = self.split_notes_id()
        self.notes_dict = {}
        self.ordered_dict = dict()
        for tag in self.notes_id_dict:
            self.notes_dict[tag] = {}
            for typ in self.notes_id_dict[tag]:
                curr_id = self.notes_id_dict[tag][typ]

                try:
                    curr_patt = pd.read_csv(f'./{tag}_{typ}_patterns.csv')
                    curr_patt.rename({'0': 'patterns'}, axis=1, inplace=True)
                except:
                    continue

                self.notes_dict[tag][typ] = self.ts.notes[curr_id].to_frame()
                self.notes_dict[tag][typ]
                try:
                    fm = fuzzymatcher.fuzzy_left_join(
                        self.notes_dict[tag][typ],
                        curr_patt,
                        left_on='notes',
                        right_on='patterns')[['notes', 'patterns']]
                except:
                    continue
                fm.index = curr_id
                fm.dropna(inplace=True)
                #display(fm)
                fm = fm.applymap(lambda x: x.split(' '))
                fm['entities'] = self.extract_entities(fm, 'notes', 'patterns')
                self.notes_dict[tag][typ] = fm['entities'].apply(
                    lambda x:
                    [' '.join(x) for x in self.get_upto_ngrams(x, 2)])
                self.ordered_dict.update(self.notes_dict[tag][typ])
        return self.ordered_dict
Example #8
File: app.py Project: gopidon/flask
def fuzzy():
    print("Fuzzy Called!")
    data_dir = os.getcwd() + '/data/'
    now = datetime.datetime.now()
    matchedFileName = 'matched' + str(now) + '.xlsx'
    params = request.json
    print('Params:', params)
    apisFile = params['apisFile']
    formCFile = params['formCFile']
    otherCompareList = params['otherCompareList']
    left_on = list(otherCompareList)
    right_on = list(otherCompareList)
    cust_index = -1
    try:
        cust_index = right_on.index("customer_name")
    except:
        cust_index = -1
    if cust_index != -1:
        right_on[cust_index] = "passenger_name"
    print("left_on:", left_on)
    print("right_on:", right_on)
    print("1.Reading Duty Free Data ...............................")
    dfree = pd.read_excel(formCFile)
    if 'passport_number' in dfree.columns:
        dfree['passport_number'] = dfree['passport_number'].astype(str)
        dfree['passport_number'] = dfree['passport_number'].str.strip()
    else:
        return jsonify({
            'error':
            True,
            'errorMessage':
            "Error processing: passport_number column not found in the uploaded Form C Excel file"
        })
    if 'customer_name' in dfree.columns:
        dfree['customer_name'] = dfree['customer_name'].astype(str)
        dfree['customer_name'] = dfree['customer_name'].str.strip()
    else:
        return jsonify({
            'error':
            True,
            'errorMessage':
            "Error processing: customer_name column not found in the uploaded Form C Excel file"
        })
    if 'flight_number' in dfree.columns:
        dfree['flight_number'] = dfree['flight_number'].astype(str)
        dfree['flight_number'] = dfree['flight_number'].str.strip()
    else:
        return jsonify({
            'error':
            True,
            'errorMessage':
            "Error processing: flight_number column not found in the uploaded Form C Excel file"
        })
    dfree = dfree.drop_duplicates()
    print("2.Finished Reading Duty Free Data ...............................")
    print(dfree.info())
    print(
        "3.Now reading Flights Data. This might take a while ..............................."
    )
    arrivals = pd.read_excel(apisFile)
    if 'passport_number' in arrivals.columns:
        arrivals['passport_number'] = arrivals['passport_number'].astype(str)
        arrivals['passport_number'] = arrivals['passport_number'].str.strip()
    else:
        return jsonify({
            'error':
            True,
            'errorMessage':
            "Error processing: passport_number column not found in the uploaded APIS Excel file"
        })
    if 'passenger_name' in arrivals.columns:
        arrivals['passenger_name'] = arrivals['passenger_name'].astype(str)
        arrivals['passenger_name'] = arrivals['passenger_name'].str.strip()
    else:
        return jsonify({
            'error':
            True,
            'errorMessage':
            "Error processing: passenger_name column not found in the uploaded APIS Excel file"
        })
    if 'flight_number' in arrivals.columns:
        arrivals['flight_number'] = arrivals['flight_number'].astype(str)
        arrivals['flight_number'] = arrivals['flight_number'].str.strip()
    else:
        return jsonify({
            'error':
            True,
            'errorMessage':
            "Error processing: flight_number column not found in the uploaded APIS Excel file"
        })
    arrivals.fillna("Not Available", inplace=True)
    arrivals = arrivals.drop_duplicates()

    print("4.Finished reading Flights Data ...............................")
    print(arrivals.info())
    print("5. Applying match algorithm....")
    matched = fuzzymatcher.fuzzy_left_join(dfree, arrivals, left_on, right_on)
    matched.to_excel(data_dir + matchedFileName)
    print("6.matched.xlsx is ready ...............................")
    return jsonify({
        'fuzzyMatched': matched.head(1000).to_json(orient='records'),
        'fuzzyMatchedFileName': matchedFileName,
        'error': False
    })
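For context, a hypothetical client call (not part of the original app) illustrating the JSON body the endpoint above reads from request.json. The '/fuzzy' route, the host URL and the file paths are assumptions; only the key names come from the handler.

import requests

payload = {
    "apisFile": "/data/apis_arrivals.xlsx",      # hypothetical path
    "formCFile": "/data/form_c_sales.xlsx",      # hypothetical path
    "otherCompareList": ["customer_name", "flight_number"],
}
# Route name assumed from the view function above.
resp = requests.post("http://localhost:5000/fuzzy", json=payload)
print(resp.json().get("fuzzyMatchedFileName"))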
Example #9
def test_fuzzy_left_join():

    pd.set_option('display.max_columns', 4)
    '''
	pd.set_option('display.max_rows', None)
	pd.set_option('display.width', None)
	pd.set_option('display.max_colwidth', None)
	pd.set_option('display.max_rows', None)
	pd.set_option('display.max_columns', None)
	'''

    ons = pd.read_csv(
        "./data/restaurants/grouped_by_restaurant/ActiveDiner.txt",
        sep='\t',
        names=["Fonte", "Ristorante", "Indirizzo"],
        converters={
            'Fonte': strip,
            'Ristorante': strip,
            'Indirizzo': strip
        })

    os = pd.read_csv(
        "./data/restaurants/grouped_by_restaurant/DiningGuide.txt",
        sep='\t',
        names=["Fonte2", "Ristorante2", "Indirizzo2"],
        converters={
            'Fonte2': strip,
            'Ristorante2': strip,
            'Indirizzo2': strip
        })

    # Columns to match on from df_left
    left_on = ["Ristorante", "Indirizzo"]

    # Columns to match on from df_right
    right_on = ["Ristorante2", "Indirizzo2"]

    df_joined = fuzzymatcher.fuzzy_left_join(ons,
                                             os,
                                             left_on=left_on,
                                             right_on=right_on)
    rename = {"best_match_score": "Score"}
    df_joined = df_joined.rename(columns=rename)
    df_joined = df_joined.sort_values("Score", ascending=False)
    df_joined.to_csv(
        './data/restaurants_integrated/output_fuzzyMatcher/results.csv',
        header=True,
        sep=";",
        decimal=',',
        float_format='%.3f')

    col_order = [
        "Score", "Ristorante", "Ristorante2", "Indirizzo", "Indirizzo2"
    ]
    print(df_joined[col_order].sample(10))

    num_records = len(df_joined)
    correct_binary = (df_joined["Ristorante"] == df_joined["Ristorante2"])
    perc_correct = correct_binary.sum() / num_records

    print("The percentage of name restaurants correctly matched was {:,.1f}%".
          format(perc_correct * 100))
Example #10
def addcols_joindata(excelpathin, jsonpathin, excelpathout):
    '''
    The function takes an input Excel file and extracts the relevant columns - 'Structure', 'Name', 'Formula'. From
    'Structure' it creates another 3 columns - inchikey, source_id, source_name. It then takes the JSON file, which is
    the parsed data from the HMDB, and merges the modified Excel data with the JSON: first by inchikey, then by name and
    finally by chemical formula. This function combines the addinchikey and joindata functions into one.
    :param excelpathin: Path to the Excel file output by Compound Discoverer, in the format r'path' -
    e.g. r'D:/BCDD/Documents/TalCompounds_export_test.xlsx'
    :param jsonpathin: Path to JSON file - parsed XML file from HMDB
    :param excelpathout: Path to the output Excel file after the merge.
    :return: The columns of the Excel file with added columns (disease name) from the JSON
    '''

    start_time = time.time()
    CD = pd.read_excel(excelpathin)
    # CD = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/MOD_REINJ_NEG_ChemSpider Results.xlsx')
    # CD = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Compounds_export_test.xlsx')
    CD = pd.DataFrame(CD[1000:2001], columns=['Structure', 'Name', 'Formula'])
    sdflist = CD.Structure

    # Loop over all cells in Structure; if the value is NaN, append NaN
    # Add a delay so we won't get blocked
    newlistinchikey = []
    newlistsource_id = []
    newlistsource_name = []
    for idx, sdf in enumerate(sdflist):
        print(idx)

        if idx % 50 == 0:
            print("--- %s seconds --f-time to %s rows" %
                  ((time.time() - start_time), idx))

            time.sleep(3.25)

        if pd.isnull(sdf):
            # print(idx)
            newlistinchikey.append(np.nan)
            newlistsource_id.append(np.nan)
            newlistsource_name.append(np.nan)
        else:
            comp = pcp.get_compounds(sdf, 'sdf')

            # In case comp[0] = Compound() (an empty result), comp[0].cid is None
            if comp[0].cid is None:
                substance = []
                newlistinchikey.append(np.nan)

            else:
                substance = pcp.get_substances(comp[0].cid, 'sid')
                # print(comp)
                # print(substance)
                # comp[0].inchikey
                newlistinchikey.append(comp[0].inchikey)

            # The if statement is in case substance= [] (empty) -> then len(substance)=0
            if len(substance) > 0:
                newlistsource_name.append(substance[0].source_name)
                newlistsource_id.append(substance[0].source_id)
            else:
                newlistsource_name.append(np.nan)
                newlistsource_id.append(np.nan)

    # Change list to Dataframe and concatenate with the original data and name them
    newlistinchikey = pd.DataFrame(newlistinchikey)
    newlistinchikey.columns = ['InChIKey']
    newlistsource_name = pd.DataFrame(newlistsource_name)
    newlistsource_name.columns = ['source_name']
    newlistsource_id = pd.DataFrame(newlistsource_id)
    newlistsource_id.columns = ['source_id']

    CD = pd.concat([CD, newlistinchikey, newlistsource_name, newlistsource_id],
                   axis=1,
                   sort=False)
    print("--- %s seconds --f-add 3 cols" % (time.time() - start_time))

    # From here on is the joindata function, with modifications
    # Load the parsed HMDB file
    with open(jsonpathin, 'r') as read_file:
        data = json.load(read_file)

    start_time = time.time()
    # Load the parse HMDB file
    # with open('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Parser_HMDB.py Output/serum_metabolites.json', 'r') as read_file:
    #     data = json.load(read_file)

    # Create a data frame from the list of dictionaries
    # df_hmdb = pd.DataFrame(data,  columns=['accession', 'name', 'chemical_formula', 'inchikey', 'disease_name' ])
    df_hmdb = pd.DataFrame(data)
    df_hmdb.drop(
        ['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'],
        axis=1)

    df_excel = CD
    # Merge by inchikey
    joindata_by_inchikey = pd.merge(left=df_excel,
                                    right=df_hmdb,
                                    how='inner',
                                    left_on='InChIKey',
                                    right_on='inchikey')

    print("--- %s seconds --f-merge by inchikey " % (time.time() - start_time))

    start_time = time.time()
    # Reduce to the rows for which we did NOT yet find a match by inchikey in both data sets
    df_hmdb_reduce_byinchik = df_hmdb.loc[~df_hmdb['inchikey'].
                                          isin(df_excel['InChIKey'])]
    df_excel_reduce_byinchik = df_excel.loc[
        ~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]

    # joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
    joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik,
                                                    df_hmdb_reduce_byinchik,
                                                    left_on="Name",
                                                    right_on="name")

    # Select threshold best_match_score > 0.55; adjustments may be needed
    joindata_by_name = joindata_by_name[
        joindata_by_name['best_match_score'] > 0.55]
    # Drop the helper columns added by fuzzymatcher
    joindata_by_name.drop(['best_match_score', '__id_left', '__id_right'],
                          axis=1,
                          inplace=True)
    print("--- %s seconds --f-merge by name" % (time.time() - start_time))

    start_time = time.time()
    # Reduce to the rows not yet matched by inchikey or by name in both data sets
    df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[
        ~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
    df_excel_reduce_byname = df_excel_reduce_byinchik.loc[
        ~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])]
    # Remove spaces from 'Formula' (pandas may warn about setting on a copy here)
    df_excel_reduce_byname.loc[:, 'Formula'] = df_excel_reduce_byname[
        'Formula'].str.replace(' ', '')

    # Merge by chemical_formula
    joindata_by_CF = pd.merge(left=df_excel_reduce_byname,
                              right=df_hmdb_reduce_byname,
                              how='inner',
                              left_on='Formula',
                              right_on='chemical_formula')

    # This data includes the rows from the original Excel file for which we did NOT find any match (by inchikey, name or CF)
    df_excel_reduce_byCF = df_excel_reduce_byname.loc[
        ~df_excel_reduce_byname['Formula'].
        isin(joindata_by_CF['chemical_formula'])]

    # Create a list of all columns of the HMDB JSON data
    colnames = joindata_by_inchikey.columns[6:]
    # Add those names as empty columns to df_excel_reduce_byCF. reducedata holds all the rows from the original Excel
    # that did not find a match, with the HMDB columns added
    reducedata = df_excel_reduce_byCF.reindex(
        columns=[*df_excel_reduce_byCF.columns.tolist(), *colnames])

    # Append all the data sets
    # out = joindata_by_inchikey.append(joindata_by_name.append(joindata_by_CF))
    out = joindata_by_inchikey.append(
        joindata_by_name.append(joindata_by_CF.append(reducedata)))

    print("--- %s seconds --f-merge by CF" % (time.time() - start_time))
    # Export the merge data to an Excel file
    writer = pd.ExcelWriter(excelpathout, engine='xlsxwriter')
    # writer = pd.ExcelWriter('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/MOD_REINJ_NEG_ChemSpider ResultsW HMDB_0_1000.xlsx', engine='xlsxwriter')
    out.to_excel(writer, header=True)
    writer.save()
    writer.close()

    return (out)
Example #11
# In[6]:

# Replace nulls in SJR with zeroes, this is what the Scimagojr website suggests.
# Eg: https://www.scimagojr.com/journalsearch.php?q=21100817106&tip=sid&clean=0
j_df.loc[j_df['SJR'].isnull(), 'SJR'] = 0

# In[7]:

PICKLE_PATH = 'dblp.pickle'
try:
    dblp_df = pd.read_pickle(PICKLE_PATH)
except:
    dblp_df0 = pd.read_json('../input/dblp-ref-0.json', lines=True)
    dblp_df1 = pd.read_json('../input/dblp-ref-1.json', lines=True)
    dblp_df2 = pd.read_json('../input/dblp-ref-2.json', lines=True)
    dblp_df3 = pd.read_json('../input/dblp-ref-3.json', lines=True)
    dblp_df = pd.concat([dblp_df0, dblp_df1, dblp_df2, dblp_df3])
    dblp_df.to_pickle(PICKLE_PATH)

# In[11]:

# Without a fuzzy join we only matched 622702 out of 3079007 rows. Now we consider a fuzzy join.

t = fuzzymatcher.fuzzy_left_join(dblp_df[3000000:], j_df, ['venue'], ['Title'])
print('done')
PICKLE_PATH = 'dblp_jr11.pickle'
t.to_pickle(PICKLE_PATH)

# In[ ]:
Example #12
	'Antarctica' : '7'
	}

for lang in languages:

	link = 'https://publications.europa.eu/code/' + lang + '/' + lang + '-5000500.htm'
	df = pd.read_html(link, header=0)[1]
	
	if lang == 'fr':
		df = df.iloc[:,[1,4,5,6,7,8]]
	else:
		df = df.iloc[:,[1,3,4,5,6,7]]
	
	df.columns = languages[lang]
	
	for (columnName, columnData) in df.iteritems():
		df[columnName] = df[columnName].str.replace(r"\([^()]*\)","").str.strip()
	
	coden = languages[lang][1]
	df = df[df[coden].str.len() == 2]
	
	df_wc = fuzzymatcher.fuzzy_left_join(df, df_cont, left_on = coden, right_on = 'CountryCode')
	
	df_wc['Colour'] = df_wc['Continent'].map(colours)
	
	df_wc.drop(df_wc.columns[[0, 1, 2, 10]], axis = 1, inplace = True) 
	
	df_wc.to_csv('countries_' + lang + '.csv', sep=',', index=False)
	

Example #13
def joindata(jsonpathin, excelpathin, excelpathout):
    '''
    The function takes 2 files and merges them by columns
    :param jsonpathin: path to the JSON file - parsed XML file from HMDB
    :param excelpathin: path to the Excel file output by LC-MS - should be of the form r'path'
    :param excelpathout: path to the output Excel file after the merge.
    :return: The columns of the Excel file with an added column (disease name) from the JSON
    '''

    # Load the parsed HMDB file
    with open(jsonpathin, 'r') as read_file:
        data = json.load(read_file)

    # create a data frame from the list of dictionaries
    # df_hmdb = pd.DataFrame(data,  columns=['accession', 'name', 'chemical_formula', 'inchikey', 'disease_name' ])
    df_hmdb = pd.DataFrame(data)
    df_hmdb.drop(['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'], axis=1)
    # Load the Excel file -
    # df_excel = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/differential_metabolites.xlsx', sheet_name='UP')
    # df_excel = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/differential_metabolites.xlsx', sheet_name='DOWN')

    # load the Excel file
    df_excel = pd.read_excel(excelpathin)

    # df_excel = pd.read_excel(r'D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/CD_10metabolites.xlsx')

    # merge by inchikey
    joindata_by_inchikey = pd.merge(left=df_excel, right=df_hmdb, how='inner', left_on='InChIKey', right_on='inchikey')

    # Reduce to the rows for which we did NOT yet find a match by inchikey in both data sets
    df_hmdb_reduce_byinchik = df_hmdb.loc[~df_hmdb['inchikey'].isin(df_excel['InChIKey'])]
    df_excel_reduce_byinchik = df_excel.loc[~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]


    start_time = time.time()
    # joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
    joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik, df_hmdb_reduce_byinchik, left_on="Name", right_on="name")

    # Select threshold best_match_score > 0.25; adjustments may be needed
    joindata_by_name = joindata_by_name[joindata_by_name['best_match_score'] > 0.25]
    # Drop the helper columns added by fuzzymatcher
    joindata_by_name.drop(['best_match_score', '__id_left', '__id_right'], axis=1, inplace=True)
    print("--- %s seconds --f-" % (time.time() - start_time))

    # Reduce to the rows not yet matched by inchikey or by name in both data sets
    df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
    df_excel_reduce_byname = df_excel_reduce_byinchik.loc[~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])]
    # Remove spaces from 'Formula' (pandas may warn about setting on a copy here)
    df_excel_reduce_byname.loc[:, 'Formula'] = df_excel_reduce_byname['Formula'].str.replace(' ', '')

    joindata_by_CF = pd.merge(left=df_excel_reduce_byname, right=df_hmdb_reduce_byname, how='inner', left_on='Formula', right_on='chemical_formula')

    # Append the merged datasets into one
    joindata_by_inchikey = joindata_by_inchikey.append(joindata_by_name.append(joindata_by_CF))
    # return joindata_by_name

    # with open('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/differential_metabolites.xlsx', 'w') as fout:
    #     json.dump(saliva_metabolites, fout, indent=4)

    # writer = pd.ExcelWriter('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/test_larger_listW disease name.xlsx',
    #                         engine='xlsxwriter')

    # Export the merge data to an Excel file
    writer = pd.ExcelWriter(excelpathout, engine='xlsxwriter')

    joindata_by_inchikey.to_excel(writer,  header=True)
    writer.save()
    writer.close()
Example #14
#!/usr/bin/env python3

import pandas as pd

import os
cwd = os.getcwd()

link = "https://lb.wikipedia.org/wiki/L%C3%ABscht_vun_de_Staate_vun_der_Welt"
df_lu = pd.read_html(link, header=0)[0]

df_lu = df_lu[['Land', 'Haaptstad']]

df_lu = df_lu.rename(columns={'Land':'CountryName_LU'})
df_lu = df_lu.rename(columns={'Haaptstad':'CapitalName_LU'})

import fuzzymatcher

world_data = pd.read_csv(cwd + '/countries_lu.txt', na_filter = False)

df_all = fuzzymatcher.fuzzy_left_join(world_data, df_lu, left_on = 'Land', right_on = 'CountryName_LU')

df_all.to_csv("countries_lu_text.csv", sep=',', index=False)

df_all = df_all[['Land','CountryName_LU','Landcode','Haaptstad', 'CapitalName_LU', 'Numm vun der Persoun', 'Adjektiv', 'Währung', 'Kontinent', 'Faarf']]

df_all.to_csv("countries_lu.csv", sep=',', index=False)
Example #15
df_nuts['Region'] = df_nuts['Region'].str.replace("Valencian Community",
                                                  "Valencia")
df_nuts['Region'] = df_nuts['Region'].str.replace("Region of Murcia", "Murcia")

df_nuts = df_nuts.drop_duplicates()

# Merge

#df_es = df_es.merge(df_nuts, on=['Region'], how='right')

# Merge

import fuzzymatcher

df_es = fuzzymatcher.fuzzy_left_join(df_nuts,
                                     df_es,
                                     left_on="Region",
                                     right_on="Region")
df_es = df_es.rename(columns={'Region_right': 'Region'})
df_es = df_es[['Code.1', 'Region', 'Deaths']]

# PORTUGAL

# Corona

link = "https://pt.wikipedia.org/wiki/Pandemia_de_COVID-19_em_Portugal"
df_pt = pd.read_html(link, header=0)[9]
#df_pt.drop(df_pt.tail(2).index,inplace=True)
df_pt = df_pt.tail(4)
df_pt = df_pt.reset_index()

df_pt = df_pt.transpose()
Example #16
def join_on_name():
    df1 = pd.read_csv(tapology_output)
    df2 = pd.read_csv(ufcstats_output)

    df = fuzzy_left_join(df1, df2, ['Name'], ['Name'])
    df.to_csv(final_output, index=False)
Example #17
# data import
#df1 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',header=2)
df1 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',
                    header=2,
                    sheet_name='SpringAlmaOutput')
#df1 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',header=2,sheet_name='FallAlmaOutput')
df1.head()
# %%
#df2 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',sheet_name='CSApprovedAdoptionList')
df2 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',
                    sheet_name='SpringBookstoreList')
#df2 = pd.read_excel('Textbook projectSummerFall2021-2.xlsx',sheet_name='FallBookstoreList')
df2.head()
#%%
#method 1
matched_results = fuzzymatcher.fuzzy_left_join(df1, df2, 'Title', 'Long Title')

#%%
matched_results[['best_match_score', 'Title', 'Long Title', 'Internal ID'
                 ]].to_excel('fuzzymatcherresults_min_spring.xlsx')
matched_results.to_excel('fuzzymatcherresults_full_spring.xlsx')
# %%
#method 2
import recordlinkage as rl
from recordlinkage.index import Full
# %%
indexer = rl.Index()
indexer.add(Full())

pairs = indexer.index(
    df1,
Example #18
def addcols_joindata(excelpathin, jsonpathin, excelpathout):
    '''
    The function takes an input Excel file and extracts the relevant columns - 'Structure', 'Name', 'Formula'. From
    'Structure' it creates another 3 columns - inchikey, source_id, source_name. It then takes the JSON file, which is
    the parsed data from the HMDB, and merges the modified Excel data with the JSON: first by inchikey, then by name and
    finally by chemical formula. This function combines the addinchikey and joindata functions into one.
    :param excelpathin: Path to the Excel file output by Compound Discoverer, in the format r'path' -
    e.g. r'D:/BCDD/Documents/TalCompounds_export_test.xlsx'
    :param jsonpathin: Path to JSON file - parsed XML file from HMDB
    :param excelpathout: Path to the output Excel file after the merge.
    :return: The columns of the Excel file with added columns (disease name) from the JSON
    '''

    start_time = time.time()
    #     CD = pd.read_excel(excelpathin)

    # data = pd.ExcelFile(r"C:\Users\USER\Downloads\MOD_REINJ_NEG_ChemSpider Results.xlsx")
    data = pd.ExcelFile(excelpathin)

    df = data.parse(sheet_name=0)
    inKey = list()
    for idx, sd in enumerate(df['Structure']):
        print(idx)
        F = open("temp.sdf", "w")
        F.writelines(sd)
        F.close()
        suppl = Chem.SDMolSupplier('temp.sdf')
        mol = next(suppl)
        if mol is None:
            inKey.append(np.nan)
        else:
            inKey.append(Chem.MolToInchiKey(mol))

    inKey = pd.DataFrame(inKey)
    inKey.columns = ['InChIKey']
    CD = pd.concat([df, inKey], axis=1, sort=False)

    print("--- %s seconds --f-add 3 cols" % (time.time() - start_time))

    # From here on is the joindata function, with modifications
    # Load the parsed HMDB file
    with open(jsonpathin, 'r') as read_file:
        data = json.load(read_file)

    start_time = time.time()
    # Load the parse HMDB file
    # with open('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Parser_HMDB.py Output/serum_metabolites.json', 'r') as read_file:
    #     data = json.load(read_file)

    # Create a data frame from the list of dictionaries
    # df_hmdb = pd.DataFrame(data,  columns=['accession', 'name', 'chemical_formula', 'inchikey', 'disease_name' ])
    df_hmdb = pd.DataFrame(data)
    df_hmdb.drop(
        ['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'],
        axis=1)

    df_excel = CD
    # Merge by inchikey
    joindata_by_inchikey = pd.merge(left=df_excel,
                                    right=df_hmdb,
                                    how='inner',
                                    left_on='InChIKey',
                                    right_on='inchikey')

    print("--- %s seconds --f-merge by inchikey " % (time.time() - start_time))

    start_time = time.time()
    # Reduce to the rows for which we did NOT yet find a match by inchikey in both data sets
    df_hmdb_reduce_byinchik = df_hmdb.loc[~df_hmdb['inchikey'].
                                          isin(df_excel['InChIKey'])]
    df_excel_reduce_byinchik = df_excel.loc[
        ~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]

    # joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
    joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik,
                                                    df_hmdb_reduce_byinchik,
                                                    left_on="Name",
                                                    right_on="name")

    # Select threshold best_match_score > 0.55; adjustments may be needed
    joindata_by_name = joindata_by_name[
        joindata_by_name['best_match_score'] > 0.55]
    # Drop the helper columns added by fuzzymatcher
    joindata_by_name.drop(['best_match_score', '__id_left', '__id_right'],
                          axis=1,
                          inplace=True)
    print("--- %s seconds --f-merge by name" % (time.time() - start_time))

    start_time = time.time()
    # Reduce to the rows not yet matched by inchikey or by name in both data sets
    df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[
        ~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
    df_excel_reduce_byname = df_excel_reduce_byinchik.loc[
        ~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])]
    # Remove spaces from 'Formula' (pandas may warn about setting on a copy here)
    df_excel_reduce_byname.loc[:, 'Formula'] = df_excel_reduce_byname[
        'Formula'].str.replace(' ', '')

    # Merge by chemical_formula
    joindata_by_CF = pd.merge(left=df_excel_reduce_byname,
                              right=df_hmdb_reduce_byname,
                              how='inner',
                              left_on='Formula',
                              right_on='chemical_formula')

    # This data includes the rows from the original Excel file for which we did NOT find any match (by inchikey, name or CF)
    df_excel_reduce_byCF = df_excel_reduce_byname.loc[
        ~df_excel_reduce_byname['Formula'].
        isin(joindata_by_CF['chemical_formula'])]

    # Create a list of all columns of the HMDB JSON data
    colnames = joindata_by_inchikey.columns[6:]
    # Add those names as empty columns to df_excel_reduce_byCF. reducedata holds all the rows from the original Excel
    # that did not find a match, with the HMDB columns added
    reducedata = df_excel_reduce_byCF.reindex(
        columns=[*df_excel_reduce_byCF.columns.tolist(), *colnames])

    # Append all the data sets
    # out = joindata_by_inchikey.append(joindata_by_name.append(joindata_by_CF))
    out = joindata_by_inchikey.append(
        joindata_by_name.append(joindata_by_CF.append(reducedata)))

    print("--- %s seconds --f-merge by CF" % (time.time() - start_time))
    # Export the merge data to an Excel file
    writer = pd.ExcelWriter(excelpathout, engine='xlsxwriter')
    # writer = pd.ExcelWriter('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/MOD_REINJ_NEG_ChemSpider ResultsW HMDB_0_1000.xlsx', engine='xlsxwriter')
    out.to_excel(writer, header=True)
    writer.save()
    writer.close()

    return (out)
Example #19
)
hospital_reimbursement = pd.read_csv(
    'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv'
)
# print(hospital_accounts.head())
# Columns to match on from df_left
left_on = ["Facility Name", "Address", "City", "State"]

# Columns to match on from df_right
right_on = [
    "Provider Name", "Provider Street Address", "Provider City",
    "Provider State"
]
# Now perform the match
# It will take several minutes to run on this data set
matched_results = fuzzymatcher.fuzzy_left_join(hospital_accounts,
                                               hospital_reimbursement,
                                               left_on,
                                               right_on,
                                               left_id_col='Account_Num',
                                               right_id_col='Provider_Num')
# Reorder the columns to make viewing easier
cols = [
    "best_match_score", "Facility Name", "Provider Name", "Address",
    "Provider Street Address", "Provider City", "City", "Provider State",
    "State"
]
# Look at the matches around 1
print(matched_results[cols].sort_values(by=['best_match_score'],
                                        ascending=False).head(5))
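As a follow-up sketch (not in the original snippet), fuzzymatcher also exposes a link_table helper that returns candidate pairs with their scores rather than a joined frame, which is useful for inspecting borderline matches before trusting the single best match per row. The sketch reuses the frames, column lists and id columns defined above.

# Same inputs as the fuzzy_left_join call above.
link = fuzzymatcher.link_table(hospital_accounts,
                               hospital_reimbursement,
                               left_on,
                               right_on,
                               left_id_col='Account_Num',
                               right_id_col='Provider_Num')
# One row per candidate pair, with a match score and rank.
print(link.head())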
Example #20
def fuzzy_city_merge(enpop, worldcities):
    '''Merge the Every Noise data with the worldcities data using exact and then fuzzy matching; returns merged cities and leftovers'''
    
    # create a dataset with precise data for specific cities 
    enpop_cities = pd.DataFrame()               
    
    # beginning exact matching attempts 
    # merge using the city variable in world cities 
    mrg, leftovers = meatloaf(enpop[['popularity','genre','city','country','country code','country code 3']],
                              worldcities[['city','lat','lng','country code 3','population']],
                              left_on=['city','country code 3'],
                              right_on=['city','country code 3'])
    enpop_cities = enpop_cities.append(mrg, ignore_index=True, sort=False)

    # merge using the city_ascii variable in world cities 
    mrg, leftovers = meatloaf(leftovers,
                              worldcities[['city_ascii','lat','lng','country code 3','population']],
                              left_on=['city','country code 3'],
                              right_on=['city_ascii','country code 3'])
    enpop_cities = enpop_cities.append(mrg, ignore_index=True, sort=False)

    # merge using the admin_name variable in world cities 
    mrg, leftovers = meatloaf(leftovers,
                              worldcities[['admin_name','lat','lng','country code 3','population']],
                              left_on=['city','country code 3'],
                              right_on=['admin_name','country code 3'])
    enpop_cities = enpop_cities.append(mrg, ignore_index=True, sort=False)

    # for the leftovers of the exact matching, 
    # retrieve a fuzzy match between unique city-countries in the leftovers and each of the worldcities variables
    merge_1 = fuzzymatcher.fuzzy_left_join(leftovers.groupby(['city','country code 3'],as_index=False)['country'].first(), 
                                           worldcities[['city','lat','lng','country code 3','population']], 
                                           ['city','country code 3'], 
                                           ['city','country code 3'])
    merge_2 = fuzzymatcher.fuzzy_left_join(leftovers.groupby(['city','country code 3'],as_index=False)['country'].first(), 
                                           worldcities[['city_ascii','lat','lng','country code 3','population']], 
                                           ['city','country code 3'], 
                                           ['city_ascii','country code 3'])
    merge_3 = fuzzymatcher.fuzzy_left_join(leftovers.groupby(['city','country code 3'],as_index=False)['country'].first(), 
                                           worldcities[['admin_name','lat','lng','country code 3','population']], 
                                           ['city','country code 3'], 
                                           ['admin_name','country code 3'])

    # concatenate the similarity scores for each merge so they can be compared
    fuzzy_eval = pd.concat([merge_1.reset_index().rename(columns={'best_match_score':'bms1','index':'idx1'})[['bms1','idx1']],     
                            merge_2.reset_index().rename(columns={'best_match_score':'bms2','index':'idx2'})[['bms2','idx2']],
                            merge_3.reset_index().rename(columns={'best_match_score':'bms3','index':'idx3'})[['bms3','idx3']]], 1)

    # find the columns (merge) with the maximum similarity for each city-country combo
    fuzzy_eval['max'] = fuzzy_eval[[c for c in fuzzy_eval if 'bms' in c]].idxmax(axis=1)    
    
    # for each merge, retrieve the indices where they had the maximum similarity out of the three merges 
    # merge that to the enpop_cities dataset, retrieve the leftovers and proceed 
    mrg1 = merge_1.loc[fuzzy_eval.loc[fuzzy_eval['max']=='bms1']['idx1']][['city_left',
                                                                           'city_right',
                                                                           'lat',
                                                                           'lng',
                                                                           'population',
                                                                           'country code 3_left']].rename(columns={'city_left':'city',
                                                                                                                   'city_right':'fuzzy_city',
                                                                                                                   'country code 3_left':'country code 3'})
    mrg, leftovers = meatloaf(leftovers,
                              mrg1,
                              left_on=['city','country code 3'],
                              right_on=['city','country code 3'])
    enpop_cities = enpop_cities.append(mrg, ignore_index=True, sort=False)
    # second merge
    mrg2 = merge_2.loc[fuzzy_eval.loc[fuzzy_eval['max']=='bms2']['idx2']][['city',
                                                                           'city_ascii',
                                                                           'lat',
                                                                           'lng',
                                                                           'population',
                                                                           'country code 3_left']].rename(columns={'city_ascii':'fuzzy_city',
                                                                                                                   'country code 3_left':'country code 3'})
    mrg, leftovers = meatloaf(leftovers,
                              mrg2,
                              left_on=['city','country code 3'],
                              right_on=['city','country code 3'])
    enpop_cities = enpop_cities.append(mrg, ignore_index=True, sort=False)
    # third merge
    mrg3 = merge_3.loc[fuzzy_eval.loc[fuzzy_eval['max']=='bms3']['idx3']][['city',
                                                                           'admin_name',
                                                                           'lat',
                                                                           'lng',
                                                                           'population',
                                                                           'country code 3_left']].rename(columns={'admin_name':'fuzzy_city',
                                                                                                                   'country code 3_left':'country code 3'})
    mrg, leftovers = meatloaf(leftovers,
                              mrg3,
                              left_on=['city','country code 3'],
                              right_on=['city','country code 3'])
    enpop_cities = enpop_cities.append(mrg, ignore_index=True, sort=False)
    
    # return the maximized match and any leftovers
    return enpop_cities, leftovers 
Example #21
df_addr['search_string'] = df_addr['address'] + ',' + df_addr['county']

df = pd.read_csv(config['dir_input'] + 'ie-towns.csv')
df = df[['name', 'county', 'eircode']].drop_duplicates()
df['search_string'] = df['name'] + ',' + df['county']
df = df[['search_string', 'eircode']]

for i in df:
    df[i] = df[i].str.upper()

# fuzzy match way

#fuzzymatcher.fuzzy_left_join(df_left, df_right, left_on = "ons_name", right_on = "os_name")

df1 = fuzzymatcher.fuzzy_left_join(df_addr,
                                   df,
                                   left_on="search_string",
                                   right_on="search_string")
print(df1)
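A possible follow-up, not in the original script: since fuzzy_left_join keeps every left-hand address and attaches a best_match_score, low-confidence matches could be filtered out before use. The 0.1 cut-off below is an arbitrary assumption and would need tuning.

# Keep only the more confident matches (threshold chosen for illustration only).
confident = df1[df1['best_match_score'] > 0.1]
print(confident[['address', 'eircode']].head())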

# subset way

# for index, row in df_addr.iterrows():

#     eircode = None
#     result_set = df[(df['county'] == row['county']) & (df['postal_town'] == row['address'])]

#     for z in result_set['eircode']:
#         eircode = str(z)

#     print( row['address'], eircode)
Example #22
def main():
    print('CD Scripting Node')

    # Start in development mode, where nodeargs are given explicitly rather than read as a command-line argument
    if sys.argv[1] == '-devel':
        print(f'Development mode: Current Dir is {os.getcwd()}')
        nodeargs_path = 'node_args.json'
    else:
        nodeargs_path = sys.argv[1]

    # parse node args from Compound Discoverer and extract location of ChemSpider Results table
    try:
        with open(nodeargs_path, 'r') as rf:
            nodeargs = json.load(rf)
            features_path = ''
            response_path = nodeargs['ExpectedResponsePath']
            tables = nodeargs['Tables']
            for table in tables:
                if table['TableName'] == 'ChemSpider Results':
                    features_path = table['DataFile']
                    if table['DataFormat'] != 'CSV':
                        print_error(f"Unknown Data Format {table['DataFormat']}")
                        exit(1)
    except Exception as e:
        print_error('Could not read Compound Discoverer node args')
        print_error(str(e))
        exit(1)

    if not features_path:
        print_error('ChemSpider Results file not defined in node args.')
        exit(1)

    try:
    # if 1 > 0:
        with open(features_path, mode='r') as protFile:
            reader = csv.DictReader(protFile, delimiter='\t')
            df = pd.DataFrame(reader)

            inKey = list()
            for idx, sd in enumerate(df['Structure']):
                # print(idx)
                F = open("temp.sdf", "w")
                F.writelines(sd)
                F.close()
                suppl = Chem.SDMolSupplier('temp.sdf')
                mol = next(suppl)
                if mol is None:
                    inKey.append(np.nan)
                else:
                    inKey.append(Chem.MolToInchiKey(mol))

            inKey = pd.DataFrame(inKey)
            inKey.columns = ['InChIKey']
            CD = pd.concat([df, inKey], axis=1, sort=False)

    # Load the parsed HMDB file
        with open('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Parser_HMDB.py Output/hmdb_metabolites.json', 'r') as read_file:
            data = json.load(read_file)

        df_hmdb = pd.DataFrame(data)
        df_hmdb.drop(['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'], axis=1)

        df_excel = CD

        # Remove spaces from 'Formula' (pandas may warn about setting on a copy here)
        CD = CD.copy()
        CD.loc[:, ('Formula')] = CD['Formula'].str.replace(" ", "")

        joindata_by_inchikey = pd.merge(left=df_excel, right=df_hmdb, how='inner', left_on='InChIKey', right_on='inchikey')
        # Reduce to the rows for which we did NOT yet find a match by inchikey in both data sets
        df_hmdb_reduce_byinchik = df_hmdb.loc[~df_hmdb['inchikey'].isin(df_excel['InChIKey'])]
        df_excel_reduce_byinchik = df_excel.loc[~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]

        # joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
        joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik, df_hmdb_reduce_byinchik, left_on="Name",
                                                        right_on="name")
        # Select threshold best_match_score > 0.85; adjustments may be needed
        joindata_by_name = joindata_by_name[joindata_by_name['best_match_score'] > 0.85]

        # Drop the helper columns added by fuzzymatcher
        joindata_by_name.drop(['best_match_score', '__id_left', '__id_right'], axis=1, inplace=True)

        # Reduce to the rows not yet matched by inchikey or by name in both data sets
        df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[
            ~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
        df_excel_reduce_byname = df_excel_reduce_byinchik.loc[
            ~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])]

        # Merge by chemical_formula
        joindata_by_CF = pd.merge(left=df_excel_reduce_byname, right=df_hmdb_reduce_byname, how='inner', left_on='Formula',
                                  right_on='chemical_formula')
        # This data includes the rows from the original Excel file for which we did NOT find any match (by inchikey, name or CF)
        df_excel_reduce_byCF = df_excel_reduce_byname.loc[
            ~df_excel_reduce_byname['Formula'].isin(joindata_by_CF['chemical_formula'])]

        # Create a list of all columns of the HMDB JSON data
        colnames = joindata_by_inchikey.columns[13:]

        # Add those names as empty columns to df_excel_reduce_byCF. reducedata holds all the rows from the original Excel
        # that did not find a match, with the HMDB columns added
        reducedata = df_excel_reduce_byCF.reindex(columns=[*df_excel_reduce_byCF.columns.tolist(), *colnames])

        out = joindata_by_inchikey.append(joindata_by_name.append(joindata_by_CF.append(reducedata)))

        # Remove duplicate column names
        out = out.drop(columns=['name', 'smiles', 'inchikey'])

        # out = out.drop(out.iloc[:,30: ])
        out.columns = [x.strip() for x in out.columns]
        # out =out.loc[: ,"ChemSpider Results CSID":"Thyroid cancer"]
    except Exception as e:
        print_error('Could not process data')
        print_error(e)
        exit(1)

    # write data file
    outfilename = "ChemSpiderResultsWithInChIKey.txt"
    (workdir, _) = os.path.split(response_path)
    outfile_path = os.path.join(workdir, outfilename)
    out.to_csv(outfile_path, sep='\t', index=False)

    # entries for new column in Features table
    response = ScriptingResponse()
    response.add_table('ChemSpider Results', outfile_path)

    # select only the columns we want to add
    for indx, colname in enumerate(out.columns[12:]):
        response.add_column('ChemSpider Results', colname, 'String')
        # print(indx, colname)

    # save to disk
    response.save(response_path)
Example #23
    'CapitalLatitude', 'CapitalLongitude', 'CountryCode'
]]

languages = {
    'en': ['Country', 'Code', 'Capital', 'Demonym', 'Adjective', 'Currency'],
    'de': [
        'Land', 'Ländercode', 'Hauptstadt', 'Personenbezeichnung', 'Adjektiv',
        'Währung'
    ],
    'fr': ['Pays', 'Code2', 'Capitale', 'Gentilé', 'Adjectif', 'Monnaie'],
    'pt': ['País', 'Código', 'Capital', 'Gentílico', 'Adjetivo', 'Moeda'],
    'sk': [
        'Krajina', 'Kód', 'Hlavné mesto', 'Obyvateľské meno', 'Prídavné meno',
        'Mena'
    ],
}

for lang in languages:

    df = pd.read_csv(cwd + '/countries_' + lang + '.csv', na_filter=False)

    coden = languages[lang][1]
    df_wc = fuzzymatcher.fuzzy_left_join(df,
                                         coordinates,
                                         left_on=coden,
                                         right_on='CountryCode')

    df_wc.drop(df_wc.columns[[0, 1, 2, 13]], axis=1, inplace=True)

    df_wc.to_csv('countries_' + lang + '_coord.csv', sep=',', index=False)