import ossPyFuncs
import pandas as pd
import re
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns

tips=sns.load_dataset("tips")

#perform sql query to get company column
postgreSql_selectQuery="SELECT company FROM gh.ctrs_raw ;"
inputRaw=ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

#load the abbreviation-expansion key file (paths are relative to the ossPyFuncs location)
currentDir=os.path.dirname('ossPyFuncs.py')
replaceList=pd.read_csv(os.path.join(currentDir,'keyFiles/expandAbrevs.csv'),quotechar="'",header=None)

#expand abbreviations in the company column; expandFromColumn returns the expanded column and a report
semiCleanedOutput, expandReport=ossPyFuncs.expandFromColumn(inputRaw['company'],replaceList)
semiCleanedOutput=pd.DataFrame(semiCleanedOutput)

#obtain the erase list
currentDir=os.path.dirname('ossPyFuncs.py')
eraseList=pd.read_csv(os.path.join(currentDir,'keyFiles/eraseStrings_v6.csv'),quotechar="'",header=None)

#apply the erase list
semiCleanedOutput=pd.DataFrame(ossPyFuncs.eraseFromColumn(semiCleanedOutput['company'],eraseList))

#get the counts for the unique values
tableUniqueFullNameCounts=semiCleanedOutput.iloc[:,0].value_counts()

#convert that output to a proper table
tableUniqueFullNameCounts=tableUniqueFullNameCounts.reset_index()

#rename the columns
tableUniqueFullNameCounts.rename(columns={"company":"count","index":"company"},inplace=True)
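#minimal, self-contained sketch (toy data, hypothetical values, not part of the pipeline above)
#of the counting step used to build tableUniqueFullNameCounts. the rename_axis/reset_index(name=...)
#spelling shown here is a version-agnostic equivalent of the reset_index/rename pattern above:
#on pandas >= 2.0 value_counts().reset_index() already yields a "count" column, while older
#versions name the value column after the Series, which is what the rename above accounts for.
toyFrame=pd.DataFrame({'company': ['google', 'google', 'ibm', 'google']})
toyCounts=(toyFrame['company'].value_counts()
           .rename_axis('company')
           .reset_index(name='count'))
print(toyCounts)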
import ossPyFuncs
import pandas as pd
import wordcloud
import re
import matplotlib.pyplot as plt
import os
import nltk

#perform sql query to get company column
postgreSql_selectQuery="SELECT company FROM gh.ctrs_raw ;"
inputRaw=ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

#load the abbreviation-expansion key file
currentDir=os.path.dirname('ossPyFuncs.py')
replaceList=pd.read_csv(os.path.join(currentDir,'keyFiles/expandAbrevs.csv'),quotechar="'",header=None)

#expand abbreviations in the company column
inputColumn, replaceList=ossPyFuncs.expandFromColumn(inputRaw['company'],replaceList)

#obtain the erase list
currentDir=os.path.dirname('ossPyFuncs.py')
eraseList=pd.read_csv(os.path.join(currentDir,'keyFiles/eraseStrings_v6.csv'),quotechar="'",header=None)

#apply the erase list
semiCleanedOutput=pd.DataFrame(ossPyFuncs.eraseFromColumn(inputRaw['company'],eraseList))

#concatenate all users' workplace names (note, we are not applying unique first)
longString=inputRaw['company'].str.cat(sep=' ')

#separate each word into an extremely long list
longStringSeparated=longString.split(' ')

#turn it into a dataframe
uniqueSubTokenFrame=pd.DataFrame(longStringSeparated)
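#one possible continuation (a sketch, not taken from the source script): count how often each
#sub-token occurs and render the concatenated string as a word cloud, using the wordcloud and
#matplotlib imports above. column 0 of uniqueSubTokenFrame holds the tokens; the WordCloud
#parameters here are illustrative choices, not values from the original.
subTokenCounts=uniqueSubTokenFrame[0].value_counts()
print(subTokenCounts.head(20))

exampleCloud=wordcloud.WordCloud(width=800,height=400,background_color='white').generate(longString)
plt.imshow(exampleCloud,interpolation='bilinear')
plt.axis('off')
plt.show()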
def spaceSymbolRemap(inputColumn):
    """Remaps entries whose space- and symbol-free strings match to the most common element.

    Keyword arguments:
    inputColumn -- a single column from a pandas dataframe, presumably with duplicate
    entries, as frequency will guide this process.  Space/symbol/case variants of the
    same string will be remapped to the most common element.
    """
    import pandas as pd
    import re
    import numpy as np
    import ossPyFuncs

    #get the input column names
    inputColumnName = inputColumn.columns
    #get the unique values (and counts)
    tableUniqueFullNameCounts = inputColumn[inputColumnName[0]].value_counts()
    #convert that output to a proper table
    tableUniqueFullNameCounts = tableUniqueFullNameCounts.reset_index()
    #rename the columns
    tableUniqueFullNameCounts.rename(columns={
        inputColumnName[0]: "count",
        "index": inputColumnName[0]
    }, inplace=True)
    #sort by count (then name) so the most frequent spellings come first
    tableUniqueFullNameCounts = tableUniqueFullNameCounts.sort_values(
        by=['count', inputColumnName[0]], ascending=[False, False])
    tableUniqueFullNameCounts = tableUniqueFullNameCounts.reset_index(drop=True)
    #build the space- and symbol-free version of each unique entry
    uniqueNoSpaceSymbol = pd.DataFrame(
        tableUniqueFullNameCounts[inputColumnName[0]].str.replace(
            '[^a-zA-Z0-9]', '', regex=True))
    tableUniqueFullNameCounts['remapping'] = ''

    #iterate across entries with guesses
    for index, row in tableUniqueFullNameCounts.iterrows():
        #set current entry number
        currentEntry = tableUniqueFullNameCounts[inputColumnName[0]].loc[index]
        #get the lowercase form of it
        #currentLower=currentEntry.lower()
        #extract current string from company vector
        currentNoSpaceOrSymbol = re.sub('\\W', '', currentEntry)
        #extract what may be a list of guesses
        noSpaceSymbolMatches = uniqueNoSpaceSymbol[
            inputColumnName[0]].str.contains('(?i)\\b' + currentNoSpaceOrSymbol + '\\b')
        #find the counts of the entries that match up with this, use the wisdom of the crowds
        currentCounts = tableUniqueFullNameCounts['count'].loc[noSpaceSymbolMatches]
        #find the listing of the label with the max frequency
        #make an array of it
        indexFrame = currentCounts.reset_index()
        #find the index
        currentIndex = indexFrame.loc[(indexFrame['count'] == np.max(currentCounts))]
        if (not index == currentIndex['index'].iloc[0]) and len(currentNoSpaceOrSymbol) > 0:
            #extract the name that is to be remapped to
            mappedName = tableUniqueFullNameCounts[inputColumnName[0]].loc[
                currentIndex['index'].iloc[0]]
            #place it in the table
            tableUniqueFullNameCounts.at[index, 'remapping'] = mappedName

    print('Remapping identification complete')

    #find where you need to perform regex replacements
    remapPresent = tableUniqueFullNameCounts['remapping'].str.len() > 0
    #create subtable for things to replace
    replacementSubtable = tableUniqueFullNameCounts.loc[remapPresent]
    #use the replacement function to replace the relevant items
    fixedList, fixedReport = ossPyFuncs.expandFromColumn(
        inputColumn, replacementSubtable[[inputColumnName[0], 'remapping']])

    print('remapping complete')

    return fixedList, fixedReport
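#usage sketch (toy data, hypothetical values): spaceSymbolRemap expects a single-column
#DataFrame and collapses space/symbol/case variants onto the most frequent spelling.
#this assumes ossPyFuncs (with expandFromColumn) is importable, since the function above
#delegates the final replacement step to it.
if __name__ == '__main__':
    import pandas as pd
    toyCompanies = pd.DataFrame(
        {'company': ['Google', 'google', 'G o o g l e', 'Google', 'I.B.M.', 'IBM', 'IBM']})
    fixedList, fixedReport = spaceSymbolRemap(toyCompanies)
    print(fixedList)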