Python sumBy Examples, util.sumBy Python Examples

Example #1

0

Show file

File: testCorrCCG.py Project: rachelboy/NHSDataScience

def plotSlopeHists(data):
	"""Show slope histograms for the real and for a shuffled CCG labelling.

	The first figure is built from ``data`` as given. The 'CCG' column is then
	randomly permuted across rows and the same histogram is drawn again, so
	the second figure shows the distribution when CCG membership carries no
	information.

	Args:
		data: pandas DataFrame with a 'CCG' column plus whatever columns
			``getPercents`` and ``findSlopes`` read (not visible here).

	Side effects:
		Calls ``pyplot.show()`` twice and overwrites ``data['CCG']`` in place
		with the shuffled labels.
	"""
	def draw_hist(frame, title):
		# getPercents' return value is ignored, so it presumably updates
		# ``summed`` in place -- TODO confirm against its definition.
		summed = util.sumBy(frame, 'CCG')
		getPercents(summed)
		pyplot.hist(findSlopes(summed), bins = 50)
		pyplot.title(title, fontsize = 18)
		pyplot.xlabel('rate of change of percent diclofenac prescribed', fontsize = 14)
		pyplot.ylabel('number of CCGs', fontsize = 14)
		pyplot.axis([-.06, .08, 0, 38])
		pyplot.show()

	draw_hist(data,
		'Actual distribution of rate of change \n of diclofenac prescription')

	# Series.tolist() yields the same list as to_dict(outtype='list')['CCG']
	# but does not depend on the `outtype` keyword, which newer pandas
	# releases no longer accept.
	CCGs = data['CCG'].tolist()
	random.shuffle(CCGs)
	data['CCG'] = CCGs

	draw_hist(data,
		'Example distribution of rate of change \n of diclofenac prescription without \ngovernance or geographic effect')

Example #2

0

Show file

File: testCorrCCG.py Project: rachelboy/NHSDataScience

def plotFinalHists(data):
	"""Show histograms of per-CCG diclofenac share, real vs. shuffled CCGs.

	NaN entries in the two 'days_prescribed_*' columns are replaced by 0,
	the share diclofenac / (naproxen + diclofenac) is computed per CCG and
	plotted, then the 'CCG' column is randomly permuted and the plot is
	repeated as a no-CCG-effect reference.

	Args:
		data: pandas DataFrame with 'CCG', 'days_prescribed_diclofenac' and
			'days_prescribed_naproxen' columns.

	Side effects:
		Calls ``pyplot.show()`` twice; rewrites the two prescription columns
		and the 'CCG' column of ``data`` in place.
	"""
	for col in ('days_prescribed_diclofenac', 'days_prescribed_naproxen'):
		# x != x is true only for NaN, so this maps NaN -> 0.
		data[col] = data[col].map(lambda x: 0 if x!=x else x)

	def draw_hist(frame, title):
		summed = util.sumBy(frame, 'CCG')
		summed['perc'] = summed['days_prescribed_diclofenac']/(summed['days_prescribed_naproxen']+summed['days_prescribed_diclofenac'])
		pyplot.hist(summed['perc'], bins = 100)
		pyplot.title(title, fontsize = 18)
		pyplot.xlabel('percent diclofenac prescribed', fontsize = 14)
		pyplot.ylabel('number of CCGs', fontsize = 14)
		pyplot.axis([0, 1, 0, 30])
		pyplot.show()

	draw_hist(data, 'Actual distribution of rate of diclofenac prescription')

	# Series.tolist() replaces to_dict(outtype='list')['CCG']; the `outtype`
	# keyword is not accepted by newer pandas releases.
	CCGs = data['CCG'].tolist()
	random.shuffle(CCGs)
	data['CCG'] = CCGs

	draw_hist(data,
		'Example distribution of rate of diclofenac \nprescription without governance or geographic effect')

Example #3

0

Show file

File: write_ratio_dataset.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Write one brand/generic ratio CSV per (brand, generic, out) triple.

		For every triple the brand and generic frames are summed by practice
		and postal code, outer-merged on the practice key and reduced to the
		postal code plus 'sum*' and 'ratio*' columns for items, quantity and
		nic. The result is written to the output path without the index.
		"""
		brand_paths = self.Config.append_dir("WriteRatioDatasetBrand")
		generic_paths = self.Config.append_dir("WriteRatioDatasetGeneric")
		out_paths = self.Config.append_dir("WriteRatioDatasetOut")

		for brand_path, generic_path, out_path in zip(brand_paths, generic_paths, out_paths):
			brand_df = self.loadDF(brand_path)
			generic_df = self.loadDF(generic_path)

			keys = self.Config.keys
			practice = keys['practice']

			brand_sums = util.sumBy(brand_df, [practice, 'postal code'])
			generic_sums = util.sumBy(generic_df, [practice, 'postal code'])
			merged = brand_sums.merge(
				generic_sums,
				on=practice,
				how='outer',
				suffixes=('brand', 'generic'))

			items = keys['items']
			quan = keys['quantity']
			nic = keys['nic']

			# Practices present on one side only have NaN counts on the
			# other side; treat those as 0 (x != x holds only for NaN).
			drops = []
			for measure in (items, quan, nic):
				for side in ('brand', 'generic'):
					column = measure + side
					merged[column] = merged[column].map(lambda x: 0 if x!=x else x)
					drops.append(column)
			drops.extend(['INCLUDEbrand', 'GENERICbrand', 'INCLUDEgeneric',
				'GENERICgeneric', 'postal codebrand', 'postal codegeneric'])

			# Prefer the brand-side postal code, fall back to the generic one.
			def pick_postcode(row):
				if row['postal codebrand'] != row['postal codebrand']:
					return row['postal codegeneric']
				return row['postal codebrand']
			merged['postal code'] = merged.apply(pick_postcode, axis=1)

			targets = [('sumitems', 'ratioitems', items),
				('sumquantity', 'ratioquantity', quan),
				('sumnic', 'rationic', nic)]
			# All sums first, then all ratios, to keep the column order.
			for sum_col, _, measure in targets:
				merged[sum_col] = merged[measure+'brand'] + merged[measure+'generic']
			for sum_col, ratio_col, measure in targets:
				merged[ratio_col] = merged[measure+'brand'] / merged[sum_col]

			merged = merged.drop(drops, axis=1)
			merged.to_csv(out_path, index = False)

Example #4

0

Show file

File: write_ratio_dataset.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Aggregate practice-level ratio files to outcode level and write CSVs.

		For each input file the brand totals ('tot*' = 'sum*' * 'ratio*') are
		rebuilt, rows are summed by outcode (the part of 'postal code' before
		the first space), ratios are recomputed from the summed totals, the
		helper 'tot*' columns are removed, and the result is left-joined with
		'postcodes.csv' on 'outcode' before being written out.

		Returns early when 'postcodes.csv' yields nothing; skips input files
		that yield nothing.
		"""
		infiles = self.Config.append_dir("OutCodeRatiosIn")
		outfiles = self.Config.append_dir("OutCodeRatiosOut")

		outcodes = self.loadDF('postcodes.csv')
		# `not dataframe` raises ValueError (ambiguous truth value), so test
		# for "nothing loaded" explicitly. Assumes loadDF signals failure
		# with None -- TODO confirm against loadDF.
		if outcodes is None or len(outcodes) == 0:
			return

		labels = ['items','quantity','nic']
		for infile,outfile in zip(infiles,outfiles):
			data = self.loadDF(infile)
			if data is None or len(data) == 0:
				continue

			for a in labels:
				data['tot'+a] = data['sum'+a]*data['ratio'+a]
			print(data)
			print(data['postal code'])
			data['outcode'] = data['postal code'].map(lambda x: x.partition(' ')[0])
			data = util.sumBy(data,'outcode')
			for a in labels:
				data['ratio'+a] = data['tot'+a]/data['sum'+a]
				# DataFrame.drop returns a new frame; the result must be
				# rebound or the 'tot*' column stays in the output.
				data = data.drop('tot'+a,axis=1)
			data = data.merge(outcodes,on='outcode',how='left')
			data.to_csv(outfile)

Example #5

0

Show file

File: time_calcs.py Project: rachelboy/NHSDataScience

def calc_drug_over_time(Config):
	"""Collect per-drug monthly quantity and nic values from 'SepGeneric'.

	Each file returned by ``Config.append_dir('SepGeneric')`` is loaded,
	summed by the bnf key, and its rows are filed under the month label
	taken from characters [-11:-4] of the file path.

	Returns:
		(quantity, nic): two dicts mapping bnf value -> {month label: value}.
	"""
	quantity = {}
	nic = {}

	for path in Config.append_dir('SepGeneric'):
		frame = loadDF(path)
		label = path[-11:-4]
		per_drug = util.sumBy(frame, Config.keys['bnf'])

		for _, row in per_drug.iterrows():
			drug = row[Config.keys['bnf']]
			# setdefault creates the inner dict on first sight of a drug.
			quantity.setdefault(drug, {})[label] = row[Config.keys['quantity']]
			nic.setdefault(drug, {})[label] = row[Config.keys['nic']]

	return quantity, nic

Example #6

0

Show file

File: nsaid_pipeline.py Project: rachelboy/NHSDataScience

	def calc_drug_over_time(self, grouping = 'practice'):
		"""Collect monthly naproxen/diclofenac days per practice or per CCG.

		Every file under 'NSAIDSummed' is loaded; with ``grouping == 'CCG'``
		it is first summed by the ccg key. Rows are filed under the month
		label taken from characters [-11:-4] of the file path.

		Args:
			grouping: 'CCG' to key results by CCG; any other value keys them
				by practice.

		Returns:
			(naproxen, diclofenac): dicts mapping group -> {month: days}.
		"""
		naproxen = {}
		diclofenac = {}
		by_ccg = grouping == 'CCG'

		for path in self.Config.append_dir('NSAIDSummed'):
			frame = self.loadDF(path)
			if by_ccg:
				frame = util.sumBy(frame, self.Config.keys['ccg'])
				# NOTE(review): 20 s pause per file kept as in the original;
				# its purpose is not visible here -- confirm it is needed.
				time.sleep(20)
			label = path[-11:-4]

			for _, row in frame.iterrows():
				group_col = self.Config.keys['ccg'] if by_ccg else self.Config.keys['practice']
				group = row[group_col]
				naproxen.setdefault(group, {})[label] = row['days_prescribed_naproxen']
				diclofenac.setdefault(group, {})[label] = row['days_prescribed_diclofenac']

		return naproxen, diclofenac

Example #7

0

Show file

File: testCorrCCG.py Project: rachelboy/NHSDataScience

def findSlopeMeanVar(base):
	"""Return ``ts.MeanVar`` of the per-CCG slopes derived from ``base``.

	``base`` is summed by 'CCG', passed through ``getPercents`` (whose
	return value is ignored) and then through ``findSlopes``.
	"""
	per_ccg = util.sumBy(base, 'CCG')
	getPercents(per_ccg)
	return ts.MeanVar(findSlopes(per_ccg))

Example #8

0

Show file

File: makeDrugPairs.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Aggregate drug-pair files to (outcode, ChemID) level and write CSVs.

		Rows are summed by outcode (the part of 'postal code' before the
		first space) and 'ChemID'. For items, quantity and nic the brand and
		generic columns are replaced by 'sum<col>' and 'percent<col>' (brand
		share). The result is left-joined with 'postcodes.csv' on 'outcode'
		and written without the index.

		Returns early when 'postcodes.csv' yields nothing; skips input files
		that yield nothing.
		"""
		infiles = self.Config.append_dir("OutCodeDrugsIn")
		outfiles = self.Config.append_dir("OutCodeDrugsOut")

		outcodes = self.loadDF('postcodes.csv')
		# `not dataframe` raises ValueError (ambiguous truth value), so test
		# for "nothing loaded" explicitly. Assumes loadDF signals failure
		# with None -- TODO confirm against loadDF.
		if outcodes is None or len(outcodes) == 0:
			return

		for infile,outfile in zip(infiles,outfiles):
			data = self.loadDF(infile)
			if data is None or len(data) == 0:
				continue

			data['outcode'] = data['postal code'].map(lambda x: x.partition(' ')[0])
			data = util.sumBy(data,['outcode','ChemID'])

			items = self.Config.keys['items']
			quan = self.Config.keys['quantity']
			nic = self.Config.keys['nic']

			for col in [items,quan,nic]:
				data['sum'+col] = data[col+'_brand']+data[col+'_gen']
				data['percent'+col]= data[col+'_brand']/data['sum'+col]
				data = data.drop([col+'_brand',col+'_gen'],axis=1)
			data = data.merge(outcodes,on='outcode',how='left')
			data.to_csv(outfile, index=False)

Example #9

0

Show file

File: testCorrs.py Project: rachelboy/NHSDataScience

 def plotRegression(self):
     """Plot items vs. cost per group and month, with the stored fit lines.

     Fit rows are read from "Results/CostQuanCorrs_PPI.csv" and grouped by
     "Group" and then "Month". For each pair the matching data file is
     loaded, summed by the bnf key, scattered as cost (nic / quantity)
     against items, and overlaid with every "linear" and "log" fit found
     for that pair. One figure is shown per (group, month).
     """
     # The original mixed a bare ``Config`` with ``self.Config``; use the
     # instance configuration throughout.
     keys = self.Config.keys
     fit_table = self.loadDF("Results/CostQuanCorrs_PPI.csv")
     for name, group in fit_table.groupby("Group"):
         for month, fits in group.groupby("Month"):
             data = self.loadDF(name + "/" + month + ".csv")
             data = util.sumBy(data, keys["bnf"])
             data["cost"] = data[keys["nic"]] / data[keys["quantity"]]
             # Series.tolist() replaces to_dict(outtype="list"); `outtype`
             # is not accepted by newer pandas releases.
             pp.scatter(data["cost"].tolist(), data[keys["items"]].tolist())
             xs = [x / 20.0 for x in range(30)]
             for index, row in fits.iterrows():
                 if row["transform"] == "linear":
                     ys = [x * row["slope"] + row["inter"] for x in xs]
                     pp.plot(xs, ys, "-.", label="linear")
                 if row["transform"] == "log":
                     ys = [math.exp(x * row["slope"] + row["inter"]) for x in xs]
                     pp.plot(xs, ys, "-", label="log")
             pp.legend()
             pp.title(name + " " + month)
             pp.ylabel("items")
             pp.xlabel("cost")
             pp.yscale("log")
             pp.show()

Example #10

0

Show file

File: testCorrs.py Project: rachelboy/NHSDataScience

 def plotAllTime(self):
     """Plot items vs. cost per group over all months, with stored fit lines.

     Fit rows are read from "Results/CostQuanCorrs_PPI_AllTime.csv" and
     grouped by "Group". For each group every file named in
     ``self.Config.filenames`` is loaded, summed by the bnf key, and its
     cost (nic / quantity) and items values are pooled into one scatter
     plot, overlaid with the group's "linear" and "log" fits. One log-log
     figure is shown per group.
     """
     # The original mixed a bare ``Config`` with ``self.Config``; use the
     # instance configuration throughout.
     keys = self.Config.keys
     fit_table = self.loadDF("Results/CostQuanCorrs_PPI_AllTime.csv")
     for name, group in fit_table.groupby("Group"):
         xs = []
         ys = []
         for month in self.Config.filenames:
             data = self.loadDF(name + "/" + month)
             data = util.sumBy(data, keys["bnf"])
             data["cost"] = data[keys["nic"]] / data[keys["quantity"]]
             # Series.tolist() replaces to_dict(outtype="list"); `outtype`
             # is not accepted by newer pandas releases.
             xs.extend(data["cost"].tolist())
             ys.extend(data[keys["items"]].tolist())
         pp.scatter(xs, ys)
         fxs = [x / 20.0 for x in range(30)]
         for index, row in group.iterrows():
             if row["transform"] == "linear":
                 fys = [x * row["slope"] + row["inter"] for x in fxs]
                 pp.plot(fxs, fys, "-.", label="linear")
             if row["transform"] == "log":
                 fys = [math.exp(x * row["slope"] + row["inter"]) for x in fxs]
                 pp.plot(fxs, fys, "-", label="log")
         pp.legend()
         pp.title(name)
         pp.ylabel("items")
         pp.xlabel("cost")
         pp.yscale("log")
         pp.xscale("log")
         pp.show()

Example #11

0

Show file

File: psychoticPipline.py Project: rachelboy/NHSDataScience

def chemGenBrandComp(Config):
    infiles = Config.append_dir("Summed_Chem_Gen", group="psych")
    antipsych = pandas.read_csv("Criteria/antipsychotics.csv", index_col="chem code")

    data = {}
    for i, r in antipsych.iterrows():
        data[r["name"]] = {"brand": [0 for i in range(len(infiles))], "gen": [0 for i in range(len(infiles))]}

    mon = 0
    for infile in infiles:
        print "Loading", infile
        df = pandas.read_csv(infile, index_col=False)

        df = util.sumBy(df, ["chem code", "name", "generic"])
        for i, r in df.iterrows():
            if r["generic"] == 1:
                gen = "gen"
            else:
                gen = "brand"
            tot = r[Config.keys["items"]]
            data[r["name"]][gen][mon] = tot

        mon += 1

    for key, value in data.items():
        if np.sum(value["brand"]) > 0 or np.sum(value["gen"]) > 0:
            pp.plot(value["brand"], "r*-", label="brand")
            pp.plot(value["gen"], "bo-", label="generic")

            pp.legend()
            pp.title("Total prescriptions of " + key)
            pp.ylabel("# prescriptions")
            pp.xlabel("months since Jan 2012")
            pp.show()

Example #12

0

Show file

File: nsaid_pipeline.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Sum each NSAID input file by CCG and write it with share columns.

		Adds 'sum' (naproxen + diclofenac days) and 'percent' (diclofenac
		days / 'sum') to the per-CCG totals and writes the frame to the
		matching output path without the index.
		"""
		sources = self.Config.append_dir("Sum_by_ccg_in", group = 'NSAID')
		targets = self.Config.append_dir("Sum_by_ccg_out", group = 'NSAID')
		for source, target in zip(sources, targets):
			per_ccg = util.sumBy(self.loadDF(source), [self.Config.keys['ccg']])
			naproxen_days = per_ccg['days_prescribed_naproxen']
			diclofenac_days = per_ccg['days_prescribed_diclofenac']
			per_ccg['sum'] = naproxen_days + diclofenac_days
			per_ccg['percent'] = diclofenac_days / per_ccg['sum']
			per_ccg.to_csv(target, index = False)

Example #13

0

Show file

File: testCorrCCG.py Project: rachelboy/NHSDataScience

def plotFinalCDF(data):
	"""Plot CDF and PDF of per-CCG diclofenac share, real vs. shuffled CCGs.

	NaN entries in the two 'days_prescribed_*' columns are replaced by 0.
	The share diclofenac / (naproxen + diclofenac) is computed per CCG for
	the real labelling and again after randomly permuting the 'CCG' column;
	both distributions are drawn side by side (CDF left, PDF right).

	Args:
		data: pandas DataFrame with 'CCG', 'days_prescribed_diclofenac' and
			'days_prescribed_naproxen' columns.

	Side effects:
		Calls ``pyplot.show()`` once; rewrites the two prescription columns
		and the 'CCG' column of ``data`` in place.
	"""
	for col in ('days_prescribed_diclofenac', 'days_prescribed_naproxen'):
		# x != x is true only for NaN, so this maps NaN -> 0.
		data[col] = data[col].map(lambda x: 0 if x!=x else x)

	def distributions(frame):
		summed = util.sumBy(frame, 'CCG')
		summed['perc'] = summed['days_prescribed_diclofenac']/(summed['days_prescribed_naproxen']+summed['days_prescribed_diclofenac'])
		cdf = ts.MakeCdfFromList(summed['perc'])
		pmf = ts.EstimatedPdf(summed['perc']).MakePmf([x/50.0 for x in range(51)])
		return cdf, pmf

	def draw_pair(actual, shuffled):
		xs,ys = actual.Render()
		pyplot.plot(xs,ys,'r-',linewidth=2,label = 'Actual')
		xs,ys = shuffled.Render()
		pyplot.plot(xs,ys,'b--',linewidth=2,label = 'No CCG correlation')

	cdf1, pmf1 = distributions(data)

	# Series.tolist() replaces to_dict(outtype='list')['CCG']; the `outtype`
	# keyword is not accepted by newer pandas releases.
	CCGs = data['CCG'].tolist()
	random.shuffle(CCGs)
	data['CCG'] = CCGs
	cdf2, pmf2 = distributions(data)

	pyplot.subplot(1,2,1)
	draw_pair(cdf1, cdf2)
	pyplot.title('CDF')
	pyplot.xlabel('percent diclofenac prescribed', fontsize = 14)
	pyplot.ylabel('fraction of CCGs at or below percentage', fontsize = 14)
	pyplot.legend(loc=4)
	pyplot.axis([0, 1, 0, 1])

	pyplot.subplot(1,2,2)
	draw_pair(pmf1, pmf2)
	pyplot.title('PDF')
	pyplot.xlabel('percent diclofenac prescribed', fontsize = 14)
	pyplot.ylabel('probability density', fontsize = 14)
	pyplot.legend(loc=1)
	pyplot.show()

Example #14

0

Show file

File: nsaid_pipeline.py Project: rachelboy/NHSDataScience

	def run(self):
		na_infiles = self.Config.append_dir("Sum_by_practice_in_nap", group='NSAID')
		dic_infiles = self.Config.append_dir("Sum_by_practice_in_dic", group = 'NSAID')
		outfiles = self.Config.append_dir("Sum_by_practice_out", group = 'NSAID')
	

		for (na_infile, dic_infile, outfile) in zip(na_infiles,dic_infiles, outfiles):
			na_df = self.loadDF(na_infile)
			dic_df = self.loadDF(dic_infile)

			grouped_na = util.sumBy(na_df,[self.Config.keys['practice'], self.Config.keys['ccg'],self.Config.keys['pct']])
			grouped_dic = util.sumBy(dic_df,[self.Config.keys['practice'], self.Config.keys['ccg'],self.Config.keys['pct']])
			
			output = pandas.DataFrame.merge(
				grouped_na, 
				grouped_dic, 
				on=[self.Config.keys['practice'], self.Config.keys['ccg'], self.Config.keys['pct']],
				how = 'outer', 
				suffixes =('_naproxen','_diclofenac'))
			print output
	
			output.to_csv(outfile, index = False)

Example #15

0

Show file

File: testCorrCCG.py Project: rachelboy/NHSDataScience

def plotSlopeCDF(data):
	"""Plot CDF and PDF of per-CCG slopes, real vs. shuffled CCG labels.

	Slopes are derived (via ``getPercents`` and ``findSlopes``) from
	``data`` summed by 'CCG', once with the real labels and once after
	randomly permuting the 'CCG' column. Both distributions are drawn side
	by side (CDF left, PDF right).

	Args:
		data: pandas DataFrame with a 'CCG' column plus whatever columns
			``getPercents`` and ``findSlopes`` read (not visible here).

	Side effects:
		Calls ``pyplot.show()`` once and overwrites ``data['CCG']`` in place
		with the shuffled labels.
	"""
	def distributions(frame):
		summed = util.sumBy(frame, 'CCG')
		getPercents(summed)
		slopes = findSlopes(summed)
		cdf = ts.MakeCdfFromList(slopes)
		pmf = ts.EstimatedPdf(slopes).MakePmf([x/200.0 for x in range(-12,21)])
		return cdf, pmf

	def draw_pair(actual, shuffled):
		xs,ys = actual.Render()
		pyplot.plot(xs,ys,'r-',linewidth=2,label = 'Actual')
		xs,ys = shuffled.Render()
		pyplot.plot(xs,ys,'b--',linewidth=2,label = 'No CCG correlation')

	cdf1, pmf1 = distributions(data)

	# Series.tolist() replaces to_dict(outtype='list')['CCG']; the `outtype`
	# keyword is not accepted by newer pandas releases.
	CCGs = data['CCG'].tolist()
	random.shuffle(CCGs)
	data['CCG'] = CCGs
	cdf2, pmf2 = distributions(data)

	pyplot.subplot(1,2,1)
	draw_pair(cdf1, cdf2)
	pyplot.xlabel('rate of change of percent diclofenac prescribed', fontsize = 14)
	pyplot.ylabel('fraction of CCGs at or below rate', fontsize = 14)
	pyplot.legend(loc=4)

	pyplot.subplot(1,2,2)
	draw_pair(pmf1, pmf2)
	pyplot.xlabel('rate of change of percent diclofenac prescribed', fontsize = 14)
	pyplot.ylabel('probability density', fontsize = 14)
	pyplot.legend(loc=1)
	pyplot.show()

Example #16

0

Show file

File: summarizeDists.py Project: rachelboy/NHSDataScience

def plotCostVQuantity(filenames, Config):
    """Plot log-transformed item cost against quantity for each data file.

    Each file is read, summed by the bnf key, given an "ITEM COST" column
    (nic / quantity), drawn via ``plotTransform`` with a log transform, and
    overlaid in red with the line returned by ``findCorr`` evaluated at
    log(ITEM COST).

    Args:
        filenames: iterable of CSV paths to plot, one figure per path.
        Config: configuration object exposing a ``keys`` mapping with
            "bnf", "nic" and "quantity" entries.
    """
    # The original looped over an undefined ``datafiles`` name and never
    # used the ``filenames`` argument.
    for datafile in filenames:
        print(datafile)

        df = pandas.read_csv(datafile)
        df = util.sumBy(df, Config.keys["bnf"])

        df["ITEM COST"] = df[Config.keys["nic"]] / df[Config.keys["quantity"]]

        plotTransform(df, "ITEM COST", Config.keys["quantity"], np.log, "Log")
        inter, slope = findCorr(df, "ITEM COST", Config.keys["quantity"], np.log)
        log_cost = np.log(df["ITEM COST"])
        ys = [inter + slope * x for x in log_cost]
        pp.scatter(log_cost, ys, color="red")
        pp.show()

Example #17

0

Show file

File: testCorrs.py Project: rachelboy/NHSDataScience

    def runAllTime(self, items_cutoff=0, cost_cutoff=0):
        """Compute all-time cost/items correlation and fits per input folder.

        For every folder in ``self.Config.directories["CorrsIn"]`` all files
        are loaded, summed by the bnf key, filtered, and pooled into cost
        (nic / quantity) and items lists. Spearman correlation, its p-value
        and a linear and a log-y regression are computed on the pooled data;
        each regression contributes one row to
        "Results/CostQuanCorrs_PPI_AllTime.csv".

        Args:
            items_cutoff: rows are kept only when items > items_cutoff.
            cost_cutoff: rows are kept only when cost > cost_cutoff.
        """
        out = {}

        def add_row(folder, transform, corr, pVal, sdev, serr, inter, slope):
            # Append one result row, keeping the original column order.
            row = (("Group", folder), ("transform", transform),
                   ("Scorr", corr), ("p", pVal),
                   ("Stan Dev", sdev), ("Stan Err", serr),
                   ("inter", inter), ("slope", slope))
            for column, value in row:
                out.setdefault(column, []).append(value)

        keys = self.Config.keys
        for folder in self.Config.directories["CorrsIn"]:
            xs = []
            ys = []
            for infile in self.Config.append_dir(folder):
                data = self.loadDF(infile)
                data = util.sumBy(data, keys["bnf"])
                data["cost"] = data[keys["nic"]] / data[keys["quantity"]]
                data = data[data[keys["items"]] > items_cutoff]
                data = data[data["cost"] > cost_cutoff]
                # Series.tolist() replaces to_dict(outtype="list"); `outtype`
                # is not accepted by newer pandas releases.
                xs.extend(data["cost"].tolist())
                ys.extend(data[keys["items"]].tolist())

            corr = ts.SpearmanCorr(xs, ys)
            pVal = self.PValue(xs, ys, actual=corr, n=1000)

            sdev, serr, inter, slope = self.regress(xs, ys)
            add_row(folder, "linear", corr, pVal, sdev, serr, inter, slope)

            sdev, serr, inter, slope = self.regress(xs, ys, ylog=True)
            add_row(folder, "log", corr, pVal, sdev, serr, inter, slope)

        data = pandas.DataFrame(out, index=None)
        data.to_csv("Results/CostQuanCorrs_PPI_AllTime.csv")

Example #18

0

Show file

File: makeDrugPairs.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Pair generic and brand rows per practice and chemical, write CSVs.

		Each input file gets a 'ChemID' (first nine characters of the bnf
		value), is summed by practice, ChemID, generic flag and postal code,
		and is split on the generic flag. The generic (1.0) and brand (0.0)
		halves are outer-merged on practice and ChemID; 'sum<col>' and
		'percent<col>' (brand share) are added for items, quantity and nic,
		and the helper columns are dropped before writing without the index.

		Input files that yield nothing are skipped.
		"""
		infiles = self.Config.append_dir("MakeDrugPairsIn")
		outfiles = self.Config.append_dir("MakeDrugPairsOut")

		for (infile,outfile) in zip(infiles,outfiles):
			data = self.loadDF(infile)
			# `not dataframe` raises ValueError (ambiguous truth value), so
			# test for "nothing loaded" explicitly. Assumes loadDF signals
			# failure with None -- TODO confirm against loadDF.
			if data is None or len(data) == 0:
				continue

			keys = self.Config.keys
			data['ChemID'] = data[keys['bnf']].map(lambda x: x[0:9])
			data = util.sumBy(data,[keys['practice'],'ChemID',keys['gen'],'postal code'])

			# Method form of groupby; the top-level pandas.groupby function
			# is not available in newer pandas releases.
			grouped = data.groupby(keys['gen'])
			data = pandas.merge(grouped.get_group(1.0),grouped.get_group(0.0),
				on =[keys['practice'],'ChemID'],
				left_index=False,
				right_index = False,
				how = 'outer',
				sort = False,
				suffixes = ('_gen','_brand'))

			# Coalesce the postal code BEFORE the NaN -> 0 fill below. In
			# the original order the fill had already turned a missing
			# brand-side postal code into 0, so the fallback to the generic
			# side could never trigger.
			data['postal code'] = data.apply(
				lambda row: row['postal code_gen']
				if row['postal code_brand']!=row['postal code_brand']
				else row['postal code_brand'],
				axis=1)

			# x != x is true only for NaN: rows present on one side only
			# get 0 for the other side's values.
			for col in data.columns.values.tolist():
				data[col] = data[col].map(lambda x: 0 if x!=x else x)

			items = keys['items']
			quan = keys['quantity']
			nic = keys['nic']

			for col in [items,quan,nic]:
				data['sum'+col] = data[col+'_brand']+data[col+'_gen']
				data['percent'+col]= data[col+'_brand']/data['sum'+col]

			data = data.drop(['INCLUDE_gen','INCLUDE_brand',
				'GENERIC_gen','GENERIC_brand',
				'postal code_gen','postal code_brand'],
				axis=1)

			data.to_csv(outfile, index = False)

Example #19

0

Show file

File: psychoticPipline.py Project: rachelboy/NHSDataScience

def stackPlot(Config):
    infiles = Config.append_dir("Ingest_out", group="psych")
    antipsych = pandas.read_csv("Criteria/antipsychotics.csv", index_col="chem code")

    data = {}
    for i, r in antipsych.iterrows():
        data[r["name"]] = []

    for infile in infiles:
        print "Loading", infile
        df = pandas.read_csv(infile)

        df = util.sumBy(df, ["chem code", "name"])
        df = df.set_index("chem code")
        for i, r in antipsych.iterrows():
            try:
                # relies on chem code being index
                tot = df.loc[i][Config.keys["items"]]
            except KeyError:
                tot = 0
            if tot != tot:
                tot = 0
            data[r["name"]].append(tot)

    prev = None
    lines = []
    legends = []
    for key, value in sorted(data.items(), key=lambda x: numpy.mean(x[1])):
        if prev:
            prev = sumLists(prev, value)
        else:
            prev = value
        if numpy.mean(value) < 1000:
            pp.plot(prev)
        else:
            lines = pp.plot(prev) + lines
            legends = [key] + legends

    pp.legend(lines, legends)
    pp.title("Cumulative presecriptions of antipsychotics")
    pp.ylabel("# prescriptions")
    pp.xlabel("months since Jan 2012")
    pp.show()

Example #20

0

Show file

File: SummStats.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Write one summary-statistics CSV per input directory and function.

		For each (files, dirname) pair every data file is loaded, summed by
		the bnf key and passed to each entry of ``self.distFuns``; entry [0]
		is used as a label and entry [1] is called with the frame. Each
		result is stamped with the date taken from characters [-11:-4] of
		the file path, and results are collected per label and written to
		'<outDir>/<dirname>_<label>_SummaryStats.csv'.
		"""
		for files, dirname in zip(self.inputs, self.inDirs):
			collected = {}
			for datafile in files:
				stamp = datafile[-11:-4]
				frame = util.sumBy(self.loadDF(datafile), self.Config.keys['bnf'])

				for distFun in self.distFuns:
					result = distFun[1](frame)
					result['Date'] = stamp
					collected.setdefault(distFun[0], []).append(result)

			for distFun in self.distFuns:
				label = distFun[0]
				outfile = self.outDir+'/'+dirname+'_'+label+'_SummaryStats.csv'
				pandas.DataFrame(collected[label]).to_csv(outfile,index=None)

Example #21

0

Show file

File: nsaid_pipeline.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Sum practice-level NSAID files by CCG and PCT and attach PCT names.

		Each input file is summed by the ccg and pct keys, gets a 'percent'
		column (diclofenac days / (diclofenac + naproxen days)), is
		outer-merged with "Criteria/pctCodeToName.csv" (pct key against its
		'code' column) and written to the matching output path without the
		index.
		"""
		sources = self.Config.append_dir("Sum_by_practice_out", group='NSAID')
		targets = self.Config.append_dir("NSAIDGov")
		pct_names = pandas.read_csv("Criteria/pctCodeToName.csv")

		for source, target in zip(sources, targets):
			frame = self.loadDF(source)

			keys = self.Config.keys
			totals = util.sumBy(frame, [keys['ccg'], keys['pct']])
			diclofenac_days = totals['days_prescribed_diclofenac']
			naproxen_days = totals['days_prescribed_naproxen']
			totals['percent'] = diclofenac_days/(diclofenac_days+naproxen_days)

			named = totals.merge(
				pct_names,
				left_on=keys['pct'],
				right_on = 'code',
				how = 'outer')
			named.to_csv(target, index = False)

Example #22

0

Show file

File: makeDrugPairs.py Project: rachelboy/NHSDataScience

	def run(self):
		"""Aggregate drug-pair files to ChemID level and write CSVs.

		Rows are summed by 'ChemID'. For items, quantity and nic the brand
		and generic columns are replaced by 'sum<col>' and 'percent<col>'
		(brand share); the result is written without the index.

		Input files that yield nothing are skipped.
		"""
		infiles = self.Config.append_dir("AllDrugsIn")
		outfiles = self.Config.append_dir("AllDrugsOut")

		for infile,outfile in zip(infiles,outfiles):
			data = self.loadDF(infile)
			# `not dataframe` raises ValueError (ambiguous truth value), so
			# test for "nothing loaded" explicitly. Assumes loadDF signals
			# failure with None -- TODO confirm against loadDF.
			if data is None or len(data) == 0:
				continue

			data = util.sumBy(data,['ChemID'])

			items = self.Config.keys['items']
			quan = self.Config.keys['quantity']
			nic = self.Config.keys['nic']

			for col in [items,quan,nic]:
				data['sum'+col] = data[col+'_brand']+data[col+'_gen']
				data['percent'+col]= data[col+'_brand']/data['sum'+col]
				data = data.drop([col+'_brand',col+'_gen'],axis=1)

			data.to_csv(outfile, index=False)

Example #23

0

Show file

File: DataSummaryPlots.py Project: rachelboy/NHSDataScience

	# 		data[row['CCG']] = [row['percent']] + data.get(row['CCG'],[]) 
	# for key, value in data.items():
	# 	pyplot.plot(value)
	# '''
	# pyplot.plot(data['mean'], label = 'average')
	# pyplot.plot(data['dev_up'],'.', label = '1 std dev up')
	# pyplot.plot(data['dev_down'],'.', label = '1 std dev down')
	# pyplot.legend()
	# pyplot.title('Averge percent diclofenac over all CCGs (Jan 2012 to Oct 2013)')
	# pyplot.ylabel('Percent diclofenac')
	# pyplot.xlabel('Months since Jan 2012')
	# pyplot.show()

	infile = Config.nsaid_directories['Ingest_out_nap']+'/Oct2013.csv'
	data = pandas.read_csv(infile)
	data = util.sumBy(data,Config.keys['bnf'])
	plotLogCDF(data,
		'days_prescribed',
		'Log(x) CDF of days prescribed for NSAID preparations (Oct 2013)',
		'','Days prescribed')


	# infile = Config.directories['Ingest_out']+'/Oct2013.csv'
	# data = pandas.read_csv(infile)
	# data = util.sumBy(data,Config.keys['bnf'])
	# plotLogCDF(data,
	# 	Config.keys['items'],
	# 	'Log(x) CDF of items for all drug preparations (Oct 2013)',
	# 	'','Prescriptions filled')
	# data['cost'] = data[Config.keys['nic']]/data[Config.keys['quantity']]
	# plotLogCDF(data,

Example #24

0

Show file

File: psychoticPipline.py Project: rachelboy/NHSDataScience

 def process(self, df):
     """Return ``df`` summed by chem code, name, CCG, PCT and generic flag."""
     group_columns = ["chem code", "name", "CCG", "PCT", "generic"]
     return util.sumBy(df, group_columns)

Example #25

0

Show file

File: plotThings.py Project: rachelboy/NHSDataScience

	mkPlotMedianMean(data2,"blue",name=data2label)

def mkPlotVersus(df,x,y,**args):
	"""Scatter-plot column ``y`` of ``df`` against column ``x``.

	Args:
		df: pandas DataFrame holding both columns.
		x: name of the column used for the horizontal axis.
		y: name of the column used for the vertical axis.
		**args: passed through unchanged to ``pp.scatter``.
	"""
	# Series.tolist() gives the same lists as to_dict(outtype='list') without
	# the `outtype` keyword (not accepted by newer pandas releases) and
	# without converting columns that are never plotted.
	pp.scatter(df[x].tolist(),df[y].tolist(),**args)
	

if __name__ =="__main__":
	Config = config.Config()

	# PPI items vs. cost: one figure per brand/generic file pair.
	brand_paths = Config.append_dir('SepBrand')
	generic_paths = Config.append_dir('SepGeneric')
	for brand, gen in zip(brand_paths, generic_paths):
		df_Brand = util.sumBy(pandas.read_csv(brand), Config.keys['bnf']) if False else None
		df_Brand = pandas.read_csv(brand)
		df_Gen = pandas.read_csv(gen)

		df_Brand = util.sumBy(df_Brand, Config.keys['bnf'])
		df_Gen = util.sumBy(df_Gen, Config.keys['bnf'])

		# Cost per unit = net ingredient cost / quantity.
		for frame in (df_Brand, df_Gen):
			frame['cost'] = frame[Config.keys['nic']]/frame[Config.keys['quantity']]

		mkPlotVersus(df_Brand, 'cost', Config.keys['items'],
			marker='x', s=35, color="red", label="Brand")
		mkPlotVersus(df_Gen, 'cost', Config.keys['items'],
			marker = '*', s=35, color="blue", label="Generic")

		pp.title(brand)
		pp.ylabel('Number of prescriptions for drug')
		pp.xlabel('Drug Cost')
		pp.show()

Example #26

0

Show file

File: totals.py Project: rachelboy/NHSDataScience

def makeDrugSums(filename,Config):
	'''Read the CSV at ``filename`` and return it summed by the bnf key.'''
	raw = pandas.read_csv(filename)
	return util.sumBy(raw, Config.keys['bnf'])

Example #27

0

Show file

File: totals.py Project: rachelboy/NHSDataScience

    return data


if __name__ == "__main__":
	Config = config.TestConfig()
	'''
	data = makeDrugSums('CompressedData/Oct2013.csv',Config)
	data['cost'] = data[Config.keys['nic']]/data[Config.keys['quantity']]
	
	'''
	data = pandas.read_csv('RatioDataset/Oct2013.csv')
	labels = ['items','quantity','nic']
	for a in labels:
		data['tot'+a] = data['sum'+a]*data['ratio'+a]
	#data['outcode'] = data['postal code'].map(lambda x: x.partition(' ')[0])
	data = util.sumBy(data,Config.keys['practice'])
	for a in labels:
		data['ratio'+a] = data['tot'+a]/data['sum'+a]
		data.drop('tot'+a,axis=1)
	
	plotLogNormal(data,'ratioitems',res=4000)
	plotLogNormal(data,'rationic',res = 4000)



	'''
	print 'All Drugs (Cost)'
	df = makeDrugSums("CompressedData/Oct2013.csv", Config)
	df['cost'] = df[Config.keys['nic']]/df[Config.keys['quantity']]
	util.plotLogNormal(df,'cost')

Example #28

0

Show file

File: testCorrCCG.py Project: rachelboy/NHSDataScience

def findFinalMeanVar(data):
	"""Return ``ts.MeanVar`` of the per-CCG diclofenac share.

	``data`` is summed by 'CCG'; the share is diclofenac days divided by
	naproxen plus diclofenac days, stored in a 'perc' column of the summed
	frame.
	"""
	per_ccg = util.sumBy(data, 'CCG')
	diclofenac_days = per_ccg['days_prescribed_diclofenac']
	total_days = per_ccg['days_prescribed_naproxen'] + per_ccg['days_prescribed_diclofenac']
	per_ccg['perc'] = diclofenac_days/total_days

	return ts.MeanVar(per_ccg['perc'])