# Exploratory script: strip legal-entity abbreviations (taken from the GLEIF
# table) out of the free-text company column and inspect the result.

# Fetch the company column.
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# Fetch the legal-entity abbreviations.
postgreSql_selectQuery = "SELECT local_language_abbreviation FROM gleif.legal_entities;"
legalEntitiesRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# Each row may hold several ';'-separated abbreviations: glue all rows
# together, split on ';' and keep the distinct values.
longLine = legalEntitiesRaw['local_language_abbreviation'].str.cat(sep=';')
longLineSeparated = pd.DataFrame(longLine.split(';'))
uniqueFrame = pd.DataFrame(longLineSeparated[0].unique())
# (A filter that dropped the bare "co" / "co." abbreviation used to sit here
# and was disabled.)

# Build one case-insensitive, word-bounded pattern per abbreviation.
# - Empty fragments (from ';;' or a trailing ';') are dropped: they would
#   produce the pattern \b\b, which matches at every word boundary.
# - The abbreviation text is escaped so that characters such as '.', '(' or
#   '+' are matched literally instead of acting as regex operators.
abbreviations = uniqueFrame[0].astype(str)
abbreviations = abbreviations[abbreviations.str.strip() != ''].reset_index(drop=True)
sqlQueryFormattedFrame = pd.DataFrame(
    r'(?i)\b' + abbreviations.map(re.escape) + r'\b')

# Apply the patterns.  This version of eraseFromColumn is unpacked into the
# cleaned column plus a table that is sorted below by its 'changeNum' column
# (presumably the number of edits each pattern made -- confirm in ossPyFuncs).
inputColumn, eraseList = ossPyFuncs.eraseFromColumn(inputRaw['company'],
                                                    sqlQueryFormattedFrame)
eraseList.sort_values(by=['changeNum'], ascending=False, inplace=True)
eraseList.reset_index(drop=True, inplace=True)

# Formulate one combined regex out of all alternatives.
# NOTE(review): barAbbreviations is not defined anywhere in this snippet; it
# must already exist in the session for these lines to run.
longLine = barAbbreviations[0].str.cat(sep='|')
# The alternation is wrapped in a non-capturing group; without it the \b
# anchors bind only to the first and the last alternative.
currentRegex = re.compile(r'(?i)\b(?:' + longLine + r')\b')

# Find abbreviations made of the token 'цак' repeated ten or more times.
# The earlier non-raw '(цак)\1{9,}' turned \1 into the control character
# \x01, so it could never match; the counted group below expresses the
# intended "one occurrence followed by at least nine repeats".
test5 = uniqueFrame[uniqueFrame[0].str.contains(r'(?:цак){10,}')]
import wordcloud
import re
import matplotlib.pyplot as plt
import os

# Exploratory script: count how often each whitespace-separated token occurs
# in the company column once the erase list has been applied.

# perform sql query to get company column
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# obtain the erase list
# NOTE: os.path.dirname() of a bare file name is '', so the path below is
# resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')
eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings.csv'),
                        quotechar="'")

# apply the erase list
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(inputRaw['company'], eraseList))

# cat together all users' workplace names (note, we are not applying unique
# first, so a name entered by many users contributes many times)
longString = semiCleanedOutput['company'].str.cat(sep=' ')

# Split on runs of whitespace.  Splitting on a single ' ' yields an empty
# token for every doubled space left behind by the erase step, and those
# empty tokens would then be counted as a word.
longStringSeparated = longString.split()

# turn it into a one-column dataframe (column 0); the dict form keeps the
# column present even when there are no tokens at all
uniqueSubTokenFrame = pd.DataFrame({0: longStringSeparated})

# get the count of each distinct token
columnUniqueCounts = uniqueSubTokenFrame.iloc[:, 0].value_counts()

# Convert to a two-column table: token, count.  Naming the index and the
# value column explicitly avoids relying on the default labels produced by
# value_counts().reset_index(), which differ between pandas 1.x and 2.x.
tableUniqueCounts = columnUniqueCounts.rename_axis("token").reset_index(name="count")
import wordcloud
import re
import matplotlib.pyplot as plt
import os

# Exploratory script: render a word cloud of the (lower-cased, erase-cleaned)
# company names and save it as an SVG file.

# perform sql query to get company column
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# force case insensitivity
lowerInput = pd.DataFrame(inputRaw['company'].str.lower())

# obtain the erase list
# NOTE: os.path.dirname() of a bare file name is '', so the paths below are
# resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')
# header=None matches how the other scripts read this same file; without it
# the first erase string is consumed as a column header and never applied.
eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings_v6.csv'),
                        quotechar="'", header=None)

# apply the erase list
semiCleanedOutput = ossPyFuncs.eraseFromColumn(lowerInput['company'], eraseList)

# replace interior spaces and periods (which the wordcloud splits at) so each
# company name survives as a single token.  regex=False makes both calls
# literal replacements regardless of the pandas version's default.
spacesReplaced = semiCleanedOutput.str.replace(' ', '_', regex=False)
periodsReplaced = spacesReplaced.str.replace('.', '_', regex=False)

# turn that output into a long string
longString = periodsReplaced.str.cat(sep=' ')

# generate a wordcloud and convert it to svg
outcloud = wordcloud.WordCloud(width=2000, height=1000,
                               max_words=2000).generate(longString)
svgCloud = outcloud.to_svg()

# save it down as an svg; the context manager writes the markup and closes
# the handle (previously the file was opened -- and thereby truncated -- but
# nothing was ever written to it)
with open(os.path.join(currentDir, 'figures/wordcloud.svg'), "w",
          encoding="utf-8") as svgOut:
    svgOut.write(svgCloud)
import nltk

# Exploratory script: expand abbreviations, apply the erase list, then count
# how often each whitespace-separated token occurs in the company column.

# perform sql query to get company column
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# NOTE: os.path.dirname() of a bare file name is '', so the paths below are
# resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')

# obtain and apply the abbreviation-expansion list
replaceList = pd.read_csv(os.path.join(currentDir, 'keyFiles/expandAbrevs.csv'),
                          quotechar="'", header=None)
# NOTE(review): inputColumn (the expanded column) is not consumed below; the
# erase step still starts from inputRaw['company'].  Whether it should be
# chained in depends on what expandFromColumn returns -- confirm.
inputColumn, replaceList = ossPyFuncs.expandFromColumn(inputRaw['company'],
                                                       replaceList)

# obtain the erase list
eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings_v6.csv'),
                        quotechar="'", header=None)

# apply the erase list
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(inputRaw['company'], eraseList))

# cat together all users' workplace names (note, we are not applying unique
# first).  The erase-cleaned column is used here; previously the raw column
# was concatenated, which left the cleaning step above without any effect on
# the counts.
longString = semiCleanedOutput['company'].str.cat(sep=' ')

# Split on runs of whitespace so that doubled spaces left by the erase step
# do not produce empty tokens that would be counted as words.
longStringSeparated = longString.split()

# turn it into a one-column dataframe (column 0); the dict form keeps the
# column present even when there are no tokens at all
uniqueSubTokenFrame = pd.DataFrame({0: longStringSeparated})

# get the count of each distinct token
columnUniqueCounts = uniqueSubTokenFrame.iloc[:, 0].value_counts()

# convert that output to a proper table
tableUniqueCounts = columnUniqueCounts.reset_index()
# reset the names
import matplotlib.pyplot as plt
import os

# Exploratory script: group users by the size of the company they list and
# total the number of users that fall into each company-size bin.

# form and perform the query
postgreSql_selectQuery = "SELECT login, company FROM gh.ctrs_raw ;"
result = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# obtain the erase list
# NOTE: os.path.dirname() of a bare file name is '', so the path below is
# resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')
eraseList = pd.read_csv(os.path.join(currentDir, 'keyFiles/eraseStrings_v6.csv'),
                        quotechar="'", header=None)

# Apply the erase list to the queried company column.  (The erase list is
# read with header=None, so it has no 'company' column; indexing it with
# that label, as was done before, raises KeyError.)
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(result['company'], eraseList))

# apply a lower to increase convergence/overlap between spellings
lowerInput = pd.DataFrame(semiCleanedOutput['company'].str.lower())

# number of users per distinct company string
companyCounts = lowerInput['company'].value_counts()

# establish the bin edges; the last edge is the largest company size seen
binVals = np.asarray([0, 1, 5, 10, 20, 50, 100, 200, np.max(companyCounts)])

# For each half-open bin (lower, upper], add up the user counts of every
# company whose size falls inside it.  binSum is kept as an (nBins, 1) array.
binSum = np.zeros([len(binVals) - 1, 1])
for iBins, (lowerEdge, upperEdge) in enumerate(zip(binVals[:-1], binVals[1:])):
    inBin = np.logical_and(companyCounts > lowerEdge,
                           companyCounts <= upperEdge)
    binSum[iBins] = companyCounts[inBin].sum()