Beispiel #1
0
def composeWorkplaceOntology():
    """Build a single-column table of workplace institution names.

    Two queries are sent through ``ossPyFuncs.queryToPDTable``: one for the
    full ``us_gov_manual.us_govman_2019`` table and one for the
    ``institution`` column of ``hipolabs.universities``.  The ``AgencyName``
    column of the first result and the ``institution`` column of the second
    are then stacked on top of each other.

    Returns:
        pandas.DataFrame: the stacked names, government agencies first.
        Duplicates are not removed and the index is not reset.
    """
    import ossPyFuncs
    import pandas as pd

    # Government agencies: the whole table is fetched, only AgencyName is used.
    agencyFrame = ossPyFuncs.queryToPDTable(
        "SELECT * FROM us_gov_manual.us_govman_2019 ;")

    # Academic institutions.
    universityFrame = ossPyFuncs.queryToPDTable(
        "SELECT institution FROM hipolabs.universities ;")

    stackedNames = pd.concat(
        [agencyFrame['AgencyName'], universityFrame['institution']])
    return pd.DataFrame(stackedNames)
Beispiel #2
0
def composeWorkplaceOntology():
    """Create a table of the distinct names of valid workplace institutions.

    Names are collected through ``ossPyFuncs.queryToPDTable`` from five
    tables: the US government manual (``AgencyName``), the hipolabs
    university list (``institution``) and three Forbes/Fortune company
    lists (``company``).

    Returns:
        pandas.DataFrame: a single column holding every distinct name once.
    """
    import ossPyFuncs
    import pandas as pd

    # One (query, name column) pair per source table.  The government table
    # is fetched with ``SELECT *``; the original author noted that column
    # names containing capital letters cause problems for that table.
    nameSources = (
        ("SELECT * FROM us_gov_manual.us_govman_2019 ;", 'AgencyName'),
        ("SELECT institution FROM hipolabs.universities ;", 'institution'),
        ("SELECT company FROM forbes.fortune2018_us1000;", 'company'),
        ("SELECT company FROM forbes.fortune2019_us1000;", 'company'),
        ("SELECT company FROM forbes.fortune2020_global2000;", 'company'),
    )

    # Run every query first, then pick the name column out of each result.
    fetchedTables = [
        ossPyFuncs.queryToPDTable(query) for query, _ in nameSources
    ]
    nameColumns = [
        table[column]
        for table, (_, column) in zip(fetchedTables, nameSources)
    ]

    # Stack the columns into one series and keep each value only once.
    distinctNames = pd.concat(nameColumns).unique()
    return pd.DataFrame(distinctNames)
Beispiel #3
0
Created on Thu Jun 25 09:54:55 2020

@author: dnb3k
"""

import ossPyFuncs
import pandas as pd
import wordcloud
import re
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns

# Fetch the raw company column that the legal-entity terms are erased from.
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# Fetch the legal-entity abbreviations.
postgreSql_selectQuery = "SELECT local_language_abbreviation FROM gleif.legal_entities;"
legalEntitiesRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# Join every row with ';' and split on ';' again, so that each
# ';'-delimited piece becomes its own entry (presumably a row can list
# several abbreviations separated by ';' -- confirm against the table).
longLine = legalEntitiesRaw['local_language_abbreviation'].str.cat(sep=';')
longLineSeparated = pd.DataFrame(longLine.split(';'))

# Keep each abbreviation only once.
distinctAbbreviations = longLineSeparated[0].unique()
uniqueFrame = pd.DataFrame(distinctAbbreviations)
# Disabled filter that would drop the bare entries "co" / "co.":
#uniqueFrame=pd.DataFrame(uniqueFrame[0][~uniqueFrame[0].str.contains('(?i)^co\.$|^co$')]).reset_index(drop=True)

# Wrap every abbreviation in a case-insensitive, word-bounded pattern.
# NOTE(review): the abbreviations are not regex-escaped, so characters such
# as '.' keep their regex meaning -- confirm this is intended.
abbreviationText = uniqueFrame[0].astype(str)
sqlQueryFormattedFrame = pd.DataFrame('(?i)\\b' + abbreviationText + '\\b')

# Erase the patterns from the company column; the call yields the processed
# column together with a per-pattern table that has a 'changeNum' column.
inputColumn, eraseList = ossPyFuncs.eraseFromColumn(
    inputRaw['company'], sqlQueryFormattedFrame)

# Largest 'changeNum' first.
eraseList.sort_values(by=['changeNum'], ascending=False, inplace=True)
Beispiel #4
0
this script generates a table containing the unique sub-tokens (i.e. 
individual words) found in workplace names

@author: dnb3k
"""

import ossPyFuncs
import pandas as pd
import wordcloud
import re
import matplotlib.pyplot as plt
import os

# Fetch the raw company column via SQL.
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"
inputRaw = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# Load the erase list from its CSV file (single quotes act as the quote
# character).
# NOTE(review): os.path.dirname of a bare file name is '', so the CSV path
# below is resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')
eraseListPath = os.path.join(currentDir, 'keyFiles/eraseStrings.csv')
eraseList = pd.read_csv(eraseListPath, quotechar="'")

# Apply the erase list to the company column.
erasedCompanies = ossPyFuncs.eraseFromColumn(inputRaw['company'], eraseList)
semiCleanedOutput = pd.DataFrame(erasedCompanies)

# Join every workplace name into one space-separated string.  Names are
# deliberately not de-duplicated first.
longString = semiCleanedOutput['company'].str.cat(sep=' ')

# Split that string on single spaces into one very long list of words.
longStringSeparated = longString.split(' ')
Beispiel #5
0
"""
Created on Fri Jun 12 08:16:24 2020
This function creates a plot that depicts the number of people that work at
a company that has some number of employees associated with it.  

@author: dnb3k
"""
import ossPyFuncs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Form and run the query for each contributor's login and company.
postgreSql_selectQuery = "SELECT login, company FROM gh.ctrs_raw ;"
result = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)

# Load the erase list from its CSV file (no header row; single quotes act
# as the quote character).
# NOTE(review): os.path.dirname of a bare file name is '', so the CSV path
# below is resolved relative to the current working directory.
currentDir = os.path.dirname('ossPyFuncs.py')
eraseList = pd.read_csv(os.path.join(currentDir,
                                     'keyFiles/eraseStrings_v6.csv'),
                        quotechar="'",
                        header=None)
# Apply the erase list to the queried company column.  The previous code
# passed eraseList['company'], but the erase list is read with header=None
# and has no 'company' column, so that lookup raised KeyError and the query
# result was never used.
semiCleanedOutput = pd.DataFrame(
    ossPyFuncs.eraseFromColumn(result['company'], eraseList))
# Lower-case the names so differently-cased spellings of the same company
# are counted together.
lowerInput = pd.DataFrame(semiCleanedOutput['company'].str.lower())

# Count how many rows carry each distinct company name.
companyCounts = lowerInput['company'].value_counts()
Beispiel #6
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 14:21:06 2020

@author: dnb3k
"""
import pandas as pd
import ossPyFuncs

# Load the workplace remapping table; the relative path is resolved against
# the current working directory.
remapTable = pd.read_csv('workplaceMapping.csv')

# Fetch the company column from gh.ctrs_raw as a pandas table.
postgreSql_selectQuery = "SELECT company FROM gh.ctrs_raw ;"

inputColumn = ossPyFuncs.queryToPDTable(postgreSql_selectQuery)


def remapColumnValuesfromTable(inputColumn, remapTable):

    import ossPyFuncs
    import pandas as pd
    import numpy as np
    import difflib

    gitWorkplaceCounts = inputColumn['company'].value_counts()
    sortedTable = gitWorkplaceCounts.reset_index()
    sortedTable.rename(columns={
        "index": "company name",
        "company": "count"
    },
                       inplace=True)