def main(argv):
    """Fuzzy-match every company row of the input file against CRM records.

    ``argv[1]`` is handed to ``PandaDataFrame``; ``argv[2]`` (country) and
    ``argv[3]`` (territory) are forwarded to the database lookup.  For each
    extracted row the company name, its keyword list and the best match with
    its score are printed, and the result is passed to ``df_cls.update_df``.
    """
    pd_file, country, territory = argv[1], argv[2], argv[3]

    ignore_words_cls = IgnoreWords()
    postgres_interface_cls = PostgresInterface()
    df_cls = PandaDataFrame(pd_file)

    for index, row in extract_row_generator(df_cls.df):
        raw_name = row["Company Name"]

        # Clean and lower-case the name, then derive the keyword list from it.
        company_name = clean(raw_name).lower()
        company_keywords_list = ignore_words_cls.return_keyword_lists(
            company_name)

        print("*****************")
        print(raw_name)
        print(company_keywords_list)

        # Find candidate matches in the DB using keywords, country and
        # territory.
        crm_results = postgres_interface_cls.get_record_match(
            company_name, company_keywords_list, country, territory)

        # Fuzzy match, seeded with an empty match and a score of 0.
        # Note: the comparison uses the raw lower-cased name, not the
        # cleaned one.
        empty_match = Match(
            crm_company_id="",
            crm_company_name="",
            crm_group_id="",
            score="")
        best_match, best_score = call_fuzzy_match_generator(
            empty_match, 0, raw_name.lower(), crm_results)

        print("{} => best_match: {}".format(best_score, best_match))

        # Hand the outcome for this row back to the data frame wrapper.
        df_cls.update_df(index, best_match, best_score)
def test_ignore_words(self):
    """
    Check the keyword lists produced by ``IgnoreWords``.

    The returned keywords are used to query similar companies in the
    database, narrowing the number of records the fuzzy match is applied
    against.
    """
    ignore_words_cls = IgnoreWords()

    # (expected keywords, company name as given before lower-casing).
    # Order is irrelevant for the keywords: both sides are sorted before
    # being compared.
    cases = [
        (["hada", "hada general trading", "tradingl.l.c"],
         "Hada General TradingL.L.C"),
        (["dst", "globalmiddle", "east", "dst globalmiddle east"],
         "Dst GlobalMiddle East Limited"),
        (["jacky's", "jacky's gulf"],
         "Jacky's Gulf Fze"),
        (["emirates trading"],
         "Emirates Trading Est."),
        (["mena", "mena business services"],
         "Mena Business Services Fz-Llc"),
        (["shokri", "hassan", "shokri hassan trading"],
         "Shokri Hassan Trading Co. L.L. C."),
        (["danube", "bulding", "danube bulding materials"],
         "Danube Bulding Materials Fzco."),
        (["alokozay", "alokozay international"],
         "Alokozay International Ltd."),
        (["malcolm", "pirnie", "malcolm pirnie middle east"],
         "Malcolm Pirnie Middle East FZC"),
        (["ojaco", "ojaco engineering"],
         "Ojaco Engineering Co."),
        (["jaber", "alec",
          "al jaber l e g t engineering & contracting alec"],
         "Al Jaber L E G T Engineering & Contracting Alec L L C"),
        (["arabtec", "arabtec holding"],
         "Arabtec Holding PJSC"),
        (["advanced", "pipes", "casts",
          "advanced pipes and casts company"],
         "Advanced Pipes and Casts Company W.L.L."),
        (["smith", "smith international"],
         "Smith International Inc."),
        (["thyssenkrupp", "xervon", "thyssenkrupp xervon u.a.e."],
         "ThyssenKrupp Xervon U.A.E. L.L.C."),
        (["noor", "al noor hospitals group"],
         "Al Noor Hospitals Group PLC"),
        (["g.i.t"],
         "G.I.T Fze"),
        (["linde", "linde engineering middle east"],
         "Linde Engineering Middle East LLC"),
        (["emco", "maintenance", "engineering maintenance company emco"],
         "Engineering Maintenance Company EMCO"),
        (["moherbie", "thermoplast", "al moherbie thermoplast"],
         "Al Moherbie Thermoplast LLC"),
        (["gibca", "gibtek", "gibca information technology gibtek"],
         "Gibca Information Technology L L C Gibtek"),
        (["y&r", "y&r abu dhabi"],
         "Y&R Abu Dhabi"),
        (["tolico", "tolico trading oilfield services"],
         "Tolico Trading Oilfield Services L L C"),
    ]

    # Plain loop: stops at the first failing case, like the sequence of
    # individual assertions it replaces.
    for expected_keywords, company in cases:
        self.assertEqual(
            sorted(expected_keywords),
            sorted(ignore_words_cls.return_keyword_lists(company.lower())))
def test_ignore_words(self):
    """
    Verify ``IgnoreWords.return_keyword_lists`` on sample company names.

    The keyword list it returns is used to query similar companies in the
    database so that the fuzzy match runs against fewer records.
    """
    ignore_words_cls = IgnoreWords()

    def expect_keywords(company, *keywords):
        # Lower-case the name, extract its keywords and compare them with
        # the expected ones, ignoring order.
        self.assertEqual(
            sorted(list(keywords)),
            sorted(ignore_words_cls.return_keyword_lists(company.lower())))

    expect_keywords("Hada General TradingL.L.C",
                    "hada", "hada general trading", "tradingl.l.c")
    expect_keywords("Dst GlobalMiddle East Limited",
                    "dst", "globalmiddle", "east", "dst globalmiddle east")
    expect_keywords("Jacky's Gulf Fze",
                    "jacky's", "jacky's gulf")
    expect_keywords("Emirates Trading Est.",
                    "emirates trading")
    expect_keywords("Mena Business Services Fz-Llc",
                    "mena", "mena business services")
    expect_keywords("Shokri Hassan Trading Co. L.L. C.",
                    "shokri", "hassan", "shokri hassan trading")
    expect_keywords("Danube Bulding Materials Fzco.",
                    "danube", "bulding", "danube bulding materials")
    expect_keywords("Alokozay International Ltd.",
                    "alokozay", "alokozay international")
    expect_keywords("Malcolm Pirnie Middle East FZC",
                    "malcolm", "pirnie", "malcolm pirnie middle east")
    expect_keywords("Ojaco Engineering Co.",
                    "ojaco", "ojaco engineering")
    expect_keywords("Al Jaber L E G T Engineering & Contracting Alec L L C",
                    "jaber", "alec",
                    "al jaber l e g t engineering & contracting alec")
    expect_keywords("Arabtec Holding PJSC",
                    "arabtec", "arabtec holding")
    expect_keywords("Advanced Pipes and Casts Company W.L.L.",
                    "advanced", "pipes", "casts",
                    "advanced pipes and casts company")
    expect_keywords("Smith International Inc.",
                    "smith", "smith international")
    expect_keywords("ThyssenKrupp Xervon U.A.E. L.L.C.",
                    "thyssenkrupp", "xervon", "thyssenkrupp xervon u.a.e.")
    expect_keywords("Al Noor Hospitals Group PLC",
                    "noor", "al noor hospitals group")
    expect_keywords("G.I.T Fze",
                    "g.i.t")
    expect_keywords("Linde Engineering Middle East LLC",
                    "linde", "linde engineering middle east")
    expect_keywords("Engineering Maintenance Company EMCO",
                    "emco", "maintenance",
                    "engineering maintenance company emco")
    expect_keywords("Al Moherbie Thermoplast LLC",
                    "moherbie", "thermoplast", "al moherbie thermoplast")
    expect_keywords("Gibca Information Technology L L C Gibtek",
                    "gibca", "gibtek", "gibca information technology gibtek")
    expect_keywords("Y&R Abu Dhabi",
                    "y&r", "y&r abu dhabi")
    expect_keywords("Tolico Trading Oilfield Services L L C",
                    "tolico", "tolico trading oilfield services")