Python CategoryToNumberAssignment Examples, CategoryToNumberAssignment Python Examples

Example #1

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDataFramesRangeSaving(StartDate, EndDate, DateStep, Gender='None', SaveEvery=1, File='OBOextract', Path='Data/Raw/', Overwrite = 1):
    """Build a categories DataFrame over consecutive date windows, saving as it goes.

    One row is generated for each window [start, start + DateStep - 1], for
    every start in range(StartDate, EndDate + 1, DateStep).  The frame is
    written to Path + File + '.csv' whenever SaveEvery rows have been added
    since the previous write.

    Parameters
    ----------
    StartDate, EndDate : int
        First window start and inclusive upper bound for window starts.
    DateStep : int
        Distance between window starts; also the width of each window.
    Gender : passed through unchanged to generateRowRange.
    SaveEvery : int
        Number of generated rows between writes of the CSV file.
    File, Path : str
        The CSV is written to (and, when resuming, read from) Path + File + '.csv'.
    Overwrite : int
        When 0 and the CSV already exists, the saved frame is loaded and the
        run tries to continue after its last row.  Any other value starts
        from an empty frame.

    Returns
    -------
    pandas DataFrame holding every row (loaded and newly generated).
    """
    import os
    import pandas as pd
    import urllib3
    retry = urllib3.util.Retry(total=100, read=100, connect=100, backoff_factor=1)
    timeout = urllib3.util.Timeout(connect=4.0, read=8.0)
    http = urllib3.PoolManager(retry=retry, timeout=timeout, maxsize=5)

    csvPath = Path + File + '.csv'
    _Iteration = 0

    DummyFrame = pd.DataFrame(columns=c2n.generateCategories())

    # Compare by value: `is 0` tests object identity, which is not a reliable
    # way to compare integers.
    if Overwrite == 0 and os.path.isfile(csvPath):
        try:
            # DataFrame.from_csv no longer exists in current pandas; read_csv
            # with index_col=0 round-trips what to_csv writes below.
            DummyFrame = pd.read_csv(csvPath, index_col=0)
            _TempStartDate = int(DummyFrame.tail(1).values[0,0] + DateStep)
            # NOTE(review): DateStep is added a second time here, which looks
            # like it skips one window on resume -- confirm the intent.
            if _TempStartDate >= StartDate:
                StartDate = _TempStartDate + DateStep
        except Exception:
            # Best effort: an unreadable or empty file means starting afresh.
            DummyFrame = pd.DataFrame(columns=c2n.generateCategories())

    for _startDate in range(StartDate, EndDate+1, DateStep):
        _Iteration += 1
        _endDate = _startDate + DateStep - 1
        print('Generating row for {} to {}, Gender: {}'.format(_startDate,_endDate,Gender))
        # Hand over the pool configured above so its retry/timeout settings
        # are actually applied to the requests.
        DummyFrame.loc[len(DummyFrame)] = generateRowRange(_startDate, _endDate, Gender, http)
        if _Iteration >= SaveEvery:
            _Iteration = 0
            DummyFrame.to_csv(csvPath)

    return DummyFrame

Example #2

0

Show file

File: OBOPairwise.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateHeadings(OffenceArray=c2n.offcat):
    """Build the headings for every unordered pair of entries in OffenceArray.

    Each entry is paired with every entry that comes after it; a heading is
    the two names, each passed through c2n.upcaseFirstLetter, joined together.

    Returns
    -------
        list of strings, in pair order (0,1), (0,2), ..., (1,2), ...
    """
    capitalise = c2n.upcaseFirstLetter
    entryCount = len(OffenceArray)

    return [
        capitalise(first) + capitalise(OffenceArray[laterIndex])
        for firstIndex, first in enumerate(OffenceArray)
        for laterIndex in range(firstIndex + 1, entryCount)
    ]

Example #3

0

Show file

File: OBOValidation.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDummyDataFrame(Delta=5):
    """Generate a DataFrame of dummy data that mimics an empirical Categories frame.

    The dummy data is meant for testing model selection: detecting the
    optimal delta and the optimal partitioning.  Four offence distributions
    over 9 offences are used (the fourth repeats the first); each offence
    probability is spread evenly over 7 punishments, giving 63 probabilities
    per distribution.  Every distribution is sampled for Delta consecutive
    rows via discreteRandomSamples.

    Parameters
    ----------
    Delta : int, number of consecutive rows drawn from each distribution

    Returns
    -------
    pandas DataFrame with 4*Delta rows indexed from 1674 upwards and columns
    c2n.generateCategories()[1:64]
    """
    punishmentCount = 7

    offenceDistributions = [
        [0.05,0.2,0.05,0.05,0.25,0.05,0.05,0.25,0.05], # C,A,C,C,B,C,C,B,C
        [0.25,0.05,0.05,0.05,0.25,0.05,0.05,0.2,0.05], # B,C,C,C,B,C,C,A,C
        [0.05,0.25,0.25,0.05,0.05,0.05,0.05,0.05,0.2], # C,B,B,C,C,C,C,C,A
        [0.05,0.2,0.05,0.05,0.25,0.05,0.05,0.25,0.05]  # C,A,C,C,B,C,C,B,C
        ]

    # Expand every offence probability into 7 equal per-punishment shares.
    probabilities = []
    for distribution in offenceDistributions:
        expanded = []
        for offenceProbability in distribution:
            expanded.extend([offenceProbability/7] * punishmentCount)
        probabilities.append(expanded)

    firstIndex = 1674
    rowLabels = list(range(firstIndex, (firstIndex + 4*Delta)))
    dummyEmp = pd.DataFrame(index=rowLabels, columns=c2n.generateCategories()[1:64])

    # Fill Delta rows per distribution, one block of rows after another.
    for blockNumber, blockProbabilities in enumerate(probabilities):
        blockStart = blockNumber * Delta
        for offset in range(Delta):
            dummyEmp.iloc[blockStart + offset] = discreteRandomSamples(blockProbabilities)

    return dummyEmp

Example #4

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDataFramesParallel(Date_List, Gender):
    """Generate the categories DataFrame for a list of dates using a worker pool.

    One job per date is queued on a workerpool.WorkerPool; each job calls
    generateRow(date, Gender, NUM_SOCKETS) and appends the result to a shared
    DataFrame.

    Parameters
    ----------
    Date_List : iterable of dates, each passed to generateRow
    Gender : passed through to generateRow for every date

    Returns
    -------
    pandas DataFrame with one row per date.  Rows are appended as jobs
    finish, so their order is not guaranteed to follow Date_List.
    """
    import threading
    import workerpool
    import pandas as pd

    # The Gender argument is honoured; it used to be overwritten with 'None'.
    DummyFrame = pd.DataFrame(columns=c2n.generateCategories())
    # Guards the len()/assignment pair below, which is not atomic and would
    # let two workers write to the same row label.
    frameLock = threading.Lock()

    NUM_SOCKETS = 3
    NUM_WORKERS = 5

    # We want a few more workers than sockets so that they have extra
    # time to parse things and such.
    workers = workerpool.WorkerPool(size=NUM_WORKERS)

    class MyJob(workerpool.Job):
        """Job that fetches the row for one date and stores it in the shared frame."""

        def __init__(self, dAte, Gender):
            self.dAte = dAte
            self.Gender = Gender

        def run(self):
            print('Generating row for {}'.format(self.dAte))
            # Fetch outside the lock so the downloads still run in parallel.
            row = generateRow(self.dAte, self.Gender, NUM_SOCKETS)
            with frameLock:
                DummyFrame.loc[len(DummyFrame)] = row

    for daTe in Date_List:
        workers.put(MyJob(daTe, Gender))

    # Send shutdown jobs to all threads, and wait until all the jobs have been completed
    # (If you don't do this, the script might hang due to a rogue undead thread.)
    workers.shutdown()
    workers.wait()

    return DummyFrame

Example #5

0

Show file

File: OBOModelling.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def deathornot(CatEmp):
    """Collapse a Categories frame into 'Death' / 'Not' totals per offence.

    Use generateDependentModelLaplace(deathornot) to generate the probability estimates

    The column groups are built statically: for every offence in c2n.offcat
    there are 7 consecutive columns, of which the third is labelled
    <Offence>Death and the other six <Offence>Not.  Columns sharing a label
    are summed.
    """
    # Label for each of the 7 per-offence columns, in column order.
    punishmentLabels = ['Not', 'Not', 'Death', 'Not', 'Not', 'Not', 'Not']

    Groupings = []
    for offence in c2n.offcat:
        offenceName = c2n.upcaseFirstLetter(offence)
        for label in punishmentLabels:
            Groupings.append(offenceName + label)

    return CatEmp.groupby(Groupings, axis=1, sort=False).sum()

Example #6

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDataFrames(Date_List, Gender):
    """Build a categories DataFrame with one row per date in Date_List.

    Each row comes from generateRow(date, Gender); rows are appended in the
    order the dates are given.  Progress is printed per date.
    """
    import pandas as pd

    categoryFrame = pd.DataFrame(columns=c2n.generateCategories())

    for currentDate in Date_List:
        print('Generating row for {}'.format(currentDate))
        nextLabel = len(categoryFrame)
        categoryFrame.loc[nextLabel] = generateRow(currentDate, Gender)

    return categoryFrame

Example #7

0

Show file

File: OBOValidation.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def validatePartitioning():
    """Exercise the partition search on hand-built frames and print the results.

    Takes no arguments and returns nothing.  Two test frames are built (an
    18-column 'death or not' frame and a 63-column full frame), an AIC table
    is generated for each with oboP.generateAICtable, and the partition with
    the minimal AIC per row is printed.
    """
    # The original author also noted this alternative test row, marked "NNNNope":
    # x = [15,85,44,56,49,51,46,54,50,50,50,50,56,44,46,54,4,96]

    # Death Or Not test.
    # Partition types per offence: A, B, A, C, A, A, A, C, A,
    # i.e. the expected partition is [[0,2,4,5,6,8],[1],[3,7]]:
    #A = [ 'breakingPeace','deception', 'miscellaneous', 'royalOffences', 'sexual', 'violentTheft']
    #B = [ 'damage' ]
    #C = [ 'kill', 'theft']
    # The occurrence of the offences is also made to differ within the same partition.
    TestRow = [40, 160, 70, 30, 20, 80, 50, 50, 20, 80, 20, 80, 10, 40, 100, 100, 20, 80]

    # Full (not Death-or-Not) test: 7 punishments per offence.
    A = [ 10, 20, 30, 40, 50, 60, 70 ]
    B = [ 70, 60, 50, 40, 30, 20, 10 ]
    C = [ 20, 10, 40, 30, 60, 50, 70 ]
    # 9 offences, pattern A, B, A, C, A, A, A, C, A as above
    # (listMul presumably scales each count by the given factor -- TODO confirm)
    TestFullRowA = A+listMul(B,2)+listMul(A,3)+C+A+A+listMul(A,2)+C+A
    # Pattern C, A, A, B, B, C, A, A, C
    TestFullRowB = C+A+listMul(A,2)+B+listMul(B,3)+C+A+listMul(A,3)+C

    deathRows = [TestRow, TestRow]
    fullRows = [TestFullRowA, TestFullRowA, TestFullRowB, TestFullRowB, TestFullRowB]
    TestFrame = pd.DataFrame(deathRows, columns=list(range(18)), index=[0,1])
    TestFullFrame = pd.DataFrame(fullRows, columns=list(range(63)), index=list(range(5)))

    partitions = partition.Partition([c2n.upcaseFirstLetter(x) for x in c2n.offcat])

    def lookupPartition(position):
        # Translate an AIC-table column label into the partition it stands for.
        return partitions[int(position)]

    print('Testing Death Or Not partitioning')
    DeathAICtable = oboP.generateAICtable(TestFrame)
    deathBestColumns = DeathAICtable.idxmin(axis=1)
    DeathAICmin = deathBestColumns.apply(lookupPartition)
    print('Found minimal partitions:')
    print(DeathAICmin)

    print('Testing full partitioning')
    AICtable = oboP.generateAICtable(TestFullFrame)
    fullBestColumns = AICtable.idxmin(axis=1)
    AICmin = fullBestColumns.apply(lookupPartition)
    print('Found minimal partitions:')
    print(AICmin)

Example #8

0

Show file

File: OBOModelling.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def hyphenateCategories(DataFrame):
    """Replace the run-together offence-punishment column labels with hyphenated ones.

    Parameters
    ----------
    DataFrame : pandas DataFrame, a Categories frame; its column labels are
        replaced in place with c2n.generateCategoriesHyphenated()

    Returns
    -------
    The same DataFrame object that was passed in, now carrying the
    hyphenated column labels.
    """
    hyphenatedLabels = c2n.generateCategoriesHyphenated()
    DataFrame.columns = hyphenatedLabels
    return DataFrame

Example #9

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDataFramesRollingRangeSaving(StartDate, EndDate, DateStep, Delta=1000, Gender='None', SaveEvery=1, File='OBOextract',  Path='Data/Raw/', Overwrite = 1):
    """Build a categories DataFrame over rolling date windows, saving as it goes.

    For every start in range(StartDate, EndDate + 1, DateStep), data is
    extracted for the window [start, start + Delta - 1].  Rows accumulate in
    a pandas DataFrame, which is written to Path + File + '.csv' whenever
    SaveEvery rows have been added since the previous write.

    Parameters
    ----------
    StartDate, EndDate : int
        First window start and inclusive upper bound for window starts.
    DateStep : int
        Distance between successive window starts.
    Delta : int
        Width of each window.
    Gender : passed through unchanged to generateRowRange.
    SaveEvery : int
        Number of generated rows between writes of the CSV file.
    File, Path : str
        The CSV is written to (and, when resuming, read from) Path + File + '.csv'.
    Overwrite : int
        When 0 and the CSV already exists, the saved frame is loaded and the
        run tries to continue after its last row.  Any other value starts
        from an empty frame.

    Returns
    -------
    pandas DataFrame holding every row (loaded and newly generated).
    """
    import os
    import pandas as pd
    import urllib3
    retry = urllib3.util.Retry(total=100, read=100, connect=100, backoff_factor=1)
    timeout = urllib3.util.Timeout(connect=4.0, read=8.0)
    http = urllib3.PoolManager(retry=retry, timeout=timeout, maxsize=5)

    csvPath = Path + File + '.csv'
    _Iteration = 0

    DummyFrame = pd.DataFrame(columns=c2n.generateCategories())

    # Compare by value: `is 0` tests object identity, which is not a reliable
    # way to compare integers.
    if Overwrite == 0 and os.path.isfile(csvPath):
        try:
            # DataFrame.from_csv no longer exists in current pandas; read_csv
            # with index_col=0 round-trips what to_csv writes below.
            DummyFrame = pd.read_csv(csvPath, index_col=0)
            _TempStartDate = int(DummyFrame.tail(1).values[0,0] + DateStep)
            # NOTE(review): DateStep is added a second time here, which looks
            # like it skips one window on resume -- confirm the intent.
            if _TempStartDate >= StartDate:
                StartDate = _TempStartDate + DateStep
        except Exception:
            # Best effort: an unreadable or empty file means starting afresh.
            DummyFrame = pd.DataFrame(columns=c2n.generateCategories())

    for _startDate in range(StartDate, EndDate+1, DateStep):
        _Iteration += 1
        _endDate = _startDate + Delta - 1
        print('Generating row for {} to {}, Gender: {}'.format(_startDate,_endDate,Gender))
        # Hand over the pool configured above so its retry/timeout settings
        # are actually applied to the requests.
        DummyFrame.loc[len(DummyFrame)] = generateRowRange(_startDate, _endDate, Gender, http)
        if _Iteration >= SaveEvery:
            _Iteration = 0
            DummyFrame.to_csv(csvPath)

    return DummyFrame

Example #10

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDataFrameInChunks(Date_List,Gender,ChunkSize=10, File='OBOextract',  Path='Data/Raw/', Overwrite = 1, Start=0, Stop=0):
    """Generate the categories DataFrame chunk by chunk, saving to CSV after every chunk.

    Parameters
    ----------
    Date_List : sequence of dates, each passed to generateRow
    Gender : passed through to generateRow for every date
    ChunkSize : int, number of dates processed between saves
    File : str, the frame is written to File + '.csv'
    Path, Overwrite : accepted for signature compatibility but not used by
        this function (NOTE(review): the file is written relative to the
        working directory, not under Path -- confirm whether that is intended)
    Start, Stop : int, positions in Date_List to process, [Start, Stop).
        When Stop <= Start, processing runs to the end of Date_List.

    Returns
    -------
    pandas DataFrame with one row per processed date.
    """
    import pandas as pd

    if Stop <= Start:
        Stop = len(Date_List)

    DummyFrame = pd.DataFrame(columns=c2n.generateCategories())

    for chunkStart in range(Start, Stop, ChunkSize):
        # A slice end is already exclusive, so no "-1" (that dropped the last
        # date of every chunk); clamp to Stop so the final chunk cannot run
        # past the requested range.
        chunkEnd = min(chunkStart + ChunkSize, Stop)
        for dAte in Date_List[chunkStart:chunkEnd]:
            print('Generating row for date {}, and gender {}'.format(dAte,Gender))
            DummyFrame.loc[len(DummyFrame)] = generateRow(dAte,Gender)
        DummyFrame.to_csv(File+'.csv')

    return DummyFrame

Example #11

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateDataFramesRange(StartDate, EndDate, DateStep, Gender='None'):
    """Build a categories DataFrame over consecutive date windows.

    One row is generated for each window [start, start + DateStep - 1], for
    every start in range(StartDate, EndDate + 1, DateStep).

    Parameters
    ----------
    StartDate, EndDate : int
        First window start and inclusive upper bound for window starts.
    DateStep : int
        Distance between window starts; also the width of each window.
    Gender : passed through unchanged to generateRowRange.

    Returns
    -------
    pandas DataFrame with one row per window.
    """
    import pandas as pd
    import urllib3
    retry = urllib3.util.Retry(total=1000, read=200, connect=200, backoff_factor=0.5)
    timeout = urllib3.util.Timeout(connect=2.0, read=4.0)
    http = urllib3.PoolManager(retry=retry, timeout=timeout, maxsize=10)

    DummyFrame = pd.DataFrame(columns=c2n.generateCategories())

    for _startDate in range(StartDate, EndDate+1, DateStep):
        _endDate = _startDate + DateStep - 1
        print('Generating row for {} to {}:'.format(_startDate,_endDate))
        # Hand over the pool configured above so its retry/timeout settings
        # are actually applied to the requests.
        DummyFrame.loc[len(DummyFrame)] = generateRowRange(_startDate, _endDate, Gender, http)

    return DummyFrame

Example #12

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def initialiseEmptyArray():
    """Return a list of zeros with one entry per category from c2n.generateCategories()."""
    categoryCount = len(c2n.generateCategories())
    zeros = [0] * categoryCount
    return zeros

Example #13

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def initialiseDataFrame():
    """Return a row-less DataFrame whose columns are c2n.generateCategories()."""
    import pandas as pd
    categoryLabels = c2n.generateCategories()
    emptyFrame = pd.DataFrame(columns=categoryLabels)
    return emptyFrame

Example #14

0

Show file

File: OBapiExtraction.py Project: sharonhoward/OBO-APIextraction-and-Analysis

def generateRowRange(StartDate, EndDate, Gender='None', http='None',Sockets=2):
    """Generate one row of data for a date range and defendant gender.

    The URL fetching is deliberately not split into generic helpers, so that
    the connection pool opened by urllib3 can stay open across all requests
    made for the row.  This may be messy looking...

    Parameters
    ----------
    StartDate, EndDate : first and last date of the period; converted with str()
    Gender : passed through to the URL generator functions
    http : urllib3 pool used for every request.  When left as the default
        'None' (or passed as None) a new PoolManager(maxsize=Sockets) is created.
    Sockets : int, maxsize of the pool created when http is not supplied

    Returns
    -------
    list : the start date (as a string) followed by the per-category and
        per-subcategory punishment totals, with the NotGuilty totals filled
        in at the positions of their column names.
    """
    import CategoryToNumberAssignment as c2n
    import urllib3

    # Compare by value: `is 'None'` tests object identity and only works when
    # the interpreter happens to intern the string.
    if http is None or (isinstance(http, str) and http == 'None'):
        http = urllib3.PoolManager(maxsize=Sockets)

    sStartDate = str(StartDate)
    sEndDate = str(EndDate)

    _Columns = c2n.generateCategories()

    # The row starts with the start date; totals are appended after it.
    Row = [sStartDate]

    # --- Breakdown of punishments by category and subcategory for the period ---

    # Categories
    for Category in c2n.offcat:
        _TempCategories = initialiseEmptyCatArray()

        #Get the Json data
        #print("Generating URL: {}".format(generateURLCategoryRange(sStartDate,sEndDate,Category, Gender)))
        _Json = URLtoJSON(generateURLCategoryRange(sStartDate,sEndDate,Category, Gender),http)

        #Find the punishment totals and place in the correct position in the array
        # adding 1 place for the Not guilty
        for Totals in _Json['breakdown']:
            _TempCategories[c2n.puncat.index(Totals['term'])+1] = Totals['total']

        # Append this offence's totals to the row
        Row.extend(_TempCategories)

    #Associate Offence Subcategories with Punishment Subcategories
    for Category in c2n.offsubcat:
        _TempCategories = initialiseEmptySubCatArray()

        #Get the Json data
        _Json = URLtoJSON(generateURLSubCategoryRange(sStartDate,sEndDate,Category, Gender),http)
        #Find the punishment totals and place in the correct position in the array
        # adding 1 place for the Not guilty
        for Totals in _Json['breakdown']:
            _TempCategories[c2n.punsubcat.index(Totals['term'])+1] = Totals['total']

        # Append this offence subcategory's totals to the row
        Row.extend(_TempCategories)

    # --- Not guilties by category and subcategory for the period ---
    _JsonNotGuiltyCat = URLtoJSON(generateURLCategoryNotGuiltyRange(sStartDate,sEndDate,Gender),http)
    _JsonNotGuiltySubCat = URLtoJSON(generateURLSubCategoryNotGuiltyRange(sStartDate,sEndDate,Gender),http)

    # Place each total at the position of its '<Term>NotGuilty' column name
    # (assumes Row lines up one-to-one with _Columns -- TODO confirm).
    for _NotGuiltyJson in (_JsonNotGuiltyCat, _JsonNotGuiltySubCat):
        for Totals in _NotGuiltyJson['breakdown']:
            Row[_Columns.index(c2n.upcaseFirstLetter(Totals['term'])+'NotGuilty')] = Totals['total']

    return Row

Example #15

0

Show file

File: OBOPartitioning.py Project: sharonhoward/OBO-APIextraction-and-Analysis

import pandas as pd
import numpy as np
from ast import literal_eval
from partitionsets import partition
import CategoryToNumberAssignment as c2n
import OBOModelling as oboM

#The following dependency is from CythonGSL from https://github.com/twiecki/CythonGSL
# The interfaces must be installed as:
#  sudo python3 setup_interface.py install
# in the CythonGSL directory. (you will need gcc and libgsl-dev or equivalent installed)
import probability_distribution as gslPDD

# Candidate Delta values (presumably window widths used by the analysis -- TODO confirm)
Deltas = [1, 2, 3, 4, 5, 10, 50, 100, 240]
# Initialise partitions: one over the capitalised offence category names,
# one over the bare positions 0..8.
partitions = partition.Partition(list(map(c2n.upcaseFirstLetter, c2n.offcat)))
partitioN = partition.Partition(list(range(9)))

def partitionAIC(EmpFrame, part, OffenceEstimateFrame = [], ReturnDeathEstimate=False, BlockPunishment='Death', Verbose=True):
    """Calculate AIC score between the EmpFrame and the model where offences are partitioned as `part'.
    
    Parameters:
    -----------
        EmpFrame : DataFrame
            DataFrame of emperical data, pre processed, maybe.
        part : nested list, 2 levels
            Partition formatted as:  [[0, 3], [1, 2, 6, 7], [4, 5, 8]]
        ReturnDeathEstimate : bool
            Whether to return the DeathEstimate frame
        
    Returns: