def publish(self):
    if 'publish' not in self.rules:
        return
    publish_rules = self.rules['publish']
    if 'repository' not in publish_rules or 'name' not in publish_rules:
        ap("Nowhere to publish.")
        return 1
    build_tag = self.build_tag()
    repository = publish_rules['repository']
    name = publish_rules['name']
    ap("Tagging")
    tag_cmd = 'docker tag %(build_tag)s %(repository)s/%(name)s' % locals()
    ap(tag_cmd)
    tag_result = call(tag_cmd.split())
    if tag_result > 0:
        return tag_result
    ap("Publishing")
    publish_cmd = 'docker push %(repository)s/%(name)s' % locals()
    ap(publish_cmd)
    return call(publish_cmd.split())
def plot_variable(data, basepath=None, dataname='', criterion=None, criterionname=[]):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = range(data.shape[1])
    ap('Plotting %s' % dataname)
    if criterion is not None:
        if not isinstance(criterion, list):
            median, lq, uq = perc(data[criterion, :])
            ax.plot(x, median, linewidth=2, color='#B22400')
            ax.fill_between(x, lq, uq, alpha=0.25, linewidth=0, color='#B22400')
        else:
            bmap = brewer2mpl.get_map('Set2', 'qualitative', 7)
            colors = bmap.mpl_colors
            for i, (x_criterion, x_label) in enumerate(
                    itertools.izip_longest(criterion, criterionname, fillvalue='Group')):
                median, lq, uq = perc(data[x_criterion, :])
                ax.plot(x, median, linewidth=2, color=colors[i], label=artist.format(x_label))
                ax.fill_between(x, lq, uq, alpha=0.25, linewidth=0, color=colors[i])
    median, lq, uq = perc(data)
    ax.plot(x, median, linewidth=2, color='#B22400', label=artist.format('Full population'))
    ax.fill_between(x, lq, uq, alpha=0.25, linewidth=0, color='#B22400')
    artist.adjust_spines(ax)
    ax.set_ylabel(artist.format(dataname))
    ax.set_xlabel(artist.format('Time'))
    ax.axvline(data.shape[1] / 3, color='r', linewidth=2, linestyle='--')
    ax.axvline(2 * data.shape[1] / 3, color='r', linewidth=2, linestyle='--')
    plt.legend(frameon=False, loc='lower left')
    plt.tight_layout()
    plt.savefig(os.path.join(basepath, '%s.png' % dataname))
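`plot_variable` unpacks `perc(...)` into a median plus lower and upper bands, but `perc` is never defined in these snippets. A minimal sketch, assuming it computes column-wise percentiles with numpy:

import numpy as np

def perc(data):
    # Hypothetical helper (assumed): column-wise median and interquartile band
    return (np.median(data, axis=0),
            np.percentile(data, 25, axis=0),
            np.percentile(data, 75, axis=0))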
def main(argv):
    input_corpus_directory = ''
    output_corpus_directory = ''
    try:
        print(argv)
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print 'Usage is: python obfuscation_software.py -i <input_corpus_directory> -o <output_corpus_directory>'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-i", "--ifile"):
            input_corpus_directory = arg
        elif opt in ("-o", "--ofile"):
            output_corpus_directory = arg
    problem_directories = glob.glob(os.path.join(input_corpus_directory, "*"))
    if os.path.exists(output_corpus_directory):
        shutil.rmtree(output_corpus_directory)
    os.mkdir(output_corpus_directory)
    for problem_directory in problem_directories:
        problem_name = os.path.basename(problem_directory)
        ap("In problem: {0}".format(problem_name))
        os.mkdir(os.path.join(output_corpus_directory, problem_name))
        output_file_path = os.path.join(output_corpus_directory, problem_name, "obfuscated.json")
        obfuscate_author(problem_directory, output_file_path)
        ap("Completed problem: {0}".format(problem_name))
def build_dockerfile(self):
    # copy private key into build
    self.prepare_key()
    build_tag = self.build_tag()
    build_cmd = 'docker build -t %(build_tag)s .' % locals()
    ap(build_cmd)
    return call(build_cmd.split())
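Several of these helpers call `self.build_tag()`, which is never shown in this corpus. A plausible sketch, assuming the tag is derived from a `name` field in the build rules; both the field name and the `:latest` suffix are guesses:

def build_tag(self):
    # Hypothetical (assumed): derive an image tag from the build rules
    return '%s:latest' % self.rules.get('name', 'build')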
def print_fsm(self):
    start = self.start
    ap("Node Id: {}".format(start.id))
    for key, next_node in start.nexts.items():
        print("key: {} => Node Id: {}".format(key, next_node.id))
    print("\n")
    for key, next_node in start.nexts.items():
        ap("From Node {} taking key {}".format(start.id, key))
        self.print_sub_fsm(next_node)
def unload_environment(self):
    for image, service_config in self.services().iteritems():
        container_name = self.container_name_for_service(image, service_config)
        stop_cmd = 'docker stop %(container_name)s' % locals()
        ap(stop_cmd)
        call(stop_cmd.split())
        rm_cmd = 'docker rm %(container_name)s' % locals()
        ap(rm_cmd)
        call(rm_cmd.split())
def load_environment(self):
    for image, service_config in self.services().iteritems():
        container_name = self.container_name_for_service(image, service_config)
        envs = ''
        if service_config and 'env' in service_config:
            envs = self.envs(service_config['env'])
        run_cmd = 'docker run -d %(envs)s --name %(container_name)s %(image)s' % locals()
        ap(run_cmd)
        result = call(run_cmd.split())
        if result > 0:
            return result
    return 0
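`load_environment` delegates to `self.envs(...)`, which does not appear in this corpus. A minimal sketch, assuming it renders an env mapping as `docker run -e` flags:

def envs(self, env_dict):
    # Hypothetical helper (assumed): {'RAILS_ENV': 'test'} -> '-e RAILS_ENV=test'
    return ' '.join('-e %s=%s' % (key, value) for key, value in env_dict.items())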
def print_sub_fsm(self, start):
    ap("Node Id: {}".format(start.id))
    if self.end == start:
        ap("===== Reached End ======")
    for key, next_node in start.nexts.items():
        print("key: {} => Node Id: {}".format(key, next_node.id))
    print("\n")
    for key, next_node in start.nexts.items():
        ap("From Node {} taking key {}".format(start.id, key))
        self.print_sub_fsm(next_node)
def process(effect, entry):
    bar.next()
    # Loop variable is x so it does not shadow the EFFECT argument
    occurring_effects = [x for drug in entry["drugs"] for x in taxonomy[drug]["effects"]]
    ap('Looking for whether %s in' % effect)
    ap(occurring_effects)
    ap(1 if effect in occurring_effects else -1)
    return 1 if effect in occurring_effects else -1
def run_build_steps(self):
    build_tag = self.build_tag()
    for step, cmds in self.build_steps().items():
        ap(step)
        all_cmds = ' && '.join(cmds)
        all_cmds = '/bin/bash -l -c "%(all_cmds)s"' % locals()
        links = self.service_links()
        build_run_cmd = 'docker run -i %(links)s %(build_tag)s %(all_cmds)s' % locals()
        ap(build_run_cmd)
        result = call(build_run_cmd, shell=True)
        ap(result)
        if result != 0:
            return result
    return 0
import json
from awesome_print import ap

drugs = open('master-drug-list').read().splitlines()
labels = json.load(open('correlation-matrix-kmeans-cluster-labels.json', 'rb'))["2"]["labels"]
clusters = {label: [drug for i, drug in enumerate(drugs) if labels[i] == label]
            for label in set(labels)}
json.dump(clusters, open("drugs-in-each-cluster.json", 'wb'))
ap(clusters)
import numpy as np
from collections import Counter
from awesome_print import ap

TAB = '\t'
#-- Load effect matrix (rows=effect, cols=document)
effect_matrix = np.loadtxt('../data/effect-matrix.tsv', delimiter=TAB)
effects = open('../data/master-class-list').read().splitlines()

#--- Prevalence of effects
effect_prevalence = {
    effect: sum(effect_matrix[i, :] == 1)
    for i, effect in enumerate(effects)
}
ap(sorted(effect_prevalence.items(), key=lambda item: item[1], reverse=True))
cutoff = 30
palette = sns.color_palette("Set2", cutoff)
# Sort effects by their counts (items, not keys) so the top-cutoff slice is meaningful
colors = {effect: palette[i]
          for i, (effect, count) in enumerate(sorted(unique_drug_classes.iteritems(),
                                                     key=lambda item: item[1],
                                                     reverse=True)[:cutoff])}
L = hierarchy.linkage(m, method='average')
Z = hierarchy.dendrogram(L, no_plot=True)
cluster_idx = hierarchy.fcluster(L, 1.4, depth=4)
drug_clusters = {str(i): [drugs[j] for j, pos in enumerate(cluster_idx) if pos == i]
                 for i in set(cluster_idx)}
ap(drug_clusters)
json.dump(drug_clusters, open('../data/drug-clusters.json', 'wb'))
row_colors = [colors[taxonomy[drugs[idx]]["effects"][0]] for idx in Z["leaves"]]
cg = sns.clustermap(m, row_linkage=L, col_linkage=L, cmap=plt.cm.Reds,
                    yticklabels=False, figsize=(8, 8), xticklabels=False)
cg.savefig('../imgs/drug-drug-correlation-w-cluster.png')
'''
ind = hierarchy.fcluster(L, 0.5*d.max(), 'distance')
fig = plt.figure()
ax2 = fig.add_axes([0.09,0.1,0.5,0.6],frame_on=False)
Z1 = hierarchy.dendrogram(L, orientation='left')
#!/usr/bin/env python
from awesome_print import ap

objects = [
    None,
    True,
    "Hello, World!",
    65535,
    3.1415926,
    [1, 2, 3, 4, 5],
    {'one': 1, 'two': 2, 'ten': 10},
    {'one': 1,
     'two': ['uno', 'dos', 'tres'],
     'six': {'ein': 1, 'zwei': 2, 'drei': 3}}
]

print '>> from awesome_print import ap'
for obj in objects:
    if type(obj) is str:
        print '>> ap("' + str(obj) + '")'
    else:
        print '>> ap(' + str(obj) + ')'
    ap(obj)
from collections import defaultdict
from awesome_print import ap

# ap("1234567\n890")
# ap([1,2,3,4,5,6,7,8,9,0,1])
# ap({'a': 1, 'ab\na': 2})
ap([None,
    {"aasdfasdf\nbdd": 1, (1, 2, 3): 2, 'b': 4, 'q': [1, 2, [1, 2, 3]]},
    2,
    u"A unicode\nstring\nmore stuff!",
    [1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 9,
     {'a': u"An em dash: \u2014\nMore characters! \u2192", 'b': -1.445}]],
   options={'indent': 2})
#create effect matrix
import json
import numpy as np
from awesome_print import ap

def iqr(data):
    return 0.5 * (np.percentile(data, 75) - np.percentile(data, 25))

db = json.load(open('../data/db.json', 'rb'))
effects = open('../data/master-effect-list', 'rb').read().splitlines()
taxonomy = json.load(open('../data/drug-taxonomy.json', 'rb'))

def process(effect, entry):
    #Calling x so as not to shadow the argument EFFECT passed to function
    effects = list(set([x for drug in entry["drugs"] for x in taxonomy[drug]["effects"]]))
    return 1 if effect in effects else -1

ap('Calculating occurrence matrix')
m = np.array([[process(effect, entry) for entry in db.values()]
              for effect in effects], dtype=int)
np.savetxt('../data/effect-matrix.tsv', m, fmt='%d', delimiter='\t')
tally = {effect: sum(m[i, :] == 1) for i, effect in enumerate(effects)}
json.dump(tally, open('../data/frequency-of-effects.json', 'wb'))
json.dump({'expected occurrence': len(db) / float(len(effects)),
           'iqr': iqr(tally.values())},
          open('../data/thresholds.json', 'wb'))

#--- Exclude effects that never occurred
idx = [i for i, effect in enumerate(effects) if tally[effect] > 200]
influence_kernel = {node: normalize(np.array([influence[predecessor]
                                              for predecessor in G.predecessors(node)]).astype(float))
                    for node in G.nodes_iter()}
agent = 0
reps = 1
record = np.zeros((reps, stop))
for rep in range(reps):
    for t in xrange(start, stop):
        EFFECT_SIZE = 0 if t < 10 else -gain * n
        internal_influence = actors[0].calculate_intent_to_drink()
        actors[0].update({'attitude to medical consequences': EFFECT_SIZE})
        ap(actors[0].snapshot(as_dict=True, print_calc=True))
        ap(actors[0].inspect_calculation())
        effect[t] = actors[0].variables['attitude to medical consequences']
        intent[t] = actors[0].variables['intent to drink']
        attitudes[agent, t] = attitudes[agent, t - 1] + epsilon * (
            (effect - attitudes[agent, t - 1])
            if (effect - attitudes[agent, t - 1]) > THRESHOLDS[agent] else 0)
        social_influence = attitudes[G.predecessors(agent), t - 1].dot(influence_kernel[agent])  #kernel already normalized
'''
effect = (1-alpha[agent])*internal_influence + alpha[agent]*social_influence
attitudes[agent,t] += (epsilon*(effect if effect > THRESHOLDS[agent] else 0))
#update agent's drinking behavior
drinking_behavior[agent,t] = G.node[agent]['actor'].variables['past month drinking']
local_medical_attitudes = np.array([G.node[influencer]['actor'].variables['attitude to medical consequences'] for influencer in G.predecessors(agent)]).dot(influence_kernel[agent])
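The kernel construction above relies on a `normalize` helper that never appears in these snippets. A minimal sketch, assuming it rescales the weight vector to sum to one so each agent's incoming influence forms a convex combination; whether the original used this L1 rescaling or something else is a guess:

def normalize(weights):
    # Hypothetical helper (assumed): rescale influence weights to sum to 1
    total = weights.sum()
    return weights / total if total > 0 else weights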
import json
import numpy as np
import utils as tech
from awesome_print import ap
from collections import Counter

db = json.load(open('../data/db.json', 'rb'))
drugs = open('../data/list_of_drugs', 'rb').read().splitlines()

#----- Descriptive Statistics
frac = len([x for x in db if len(db[x]["drugs"]) > 0]) / float(len(db))
ap("Fraction of db with recognized drugs: %.04f" % frac)
tally = dict(Counter(tech.flatten([db[entry]["drugs"] for entry in db])))
drugs_that_occur = {
    drug: tally[drug] if drug in tally else 0
    for drug in drugs
}
dto = drugs_that_occur.values()
with open('../data/drugs-that-actually-occurr', 'wb') as fid:
    for drug in tally:
        print >> fid, drug
json.dump(tally, open('../data/drugs-that-actually-occurr.json', 'wb'))

#--- What entries have no drugs?
'''
for entry in db:
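`utils` (imported as `tech`) is not part of this corpus; its `flatten` is used here and in later snippets on lists whose elements may themselves be lists. A minimal sketch of one plausible implementation:

def flatten(nested):
    # Hypothetical helper (assumed): flatten one level, passing scalars through
    flat = []
    for item in nested:
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat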
from types import BooleanType
from awesome_print import ap

objects = [
    ["abc\ndef", {(1, 2): (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}],
    None,
    BooleanType,
    True,
    "Hello, World!",
    65535,
    3.1415926,
    (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
    [1, 2, 3, 4, 5],
    {'one': 1, 'two': 2, 'ten': 10},
    {'one': 1,
     'two': ['uno', 'dos', 'tres'],
     'six': {'ein': 1, 'zwei': 2, 'drei': 3}}
]

print '>> from awesome_print import ap'
for obj in objects:
    if type(obj) is str:
        print '>> ap("' + str(obj) + '")'
    else:
        print '>> ap(' + str(obj) + ')'
    ap(obj)
#-----INITIALIZE------------------------------------------
data = {}
directory = json.load(open(os.path.join(basepath, 'directory.json'), READ))
for variable in directory:
    data[variable] = (np.load(directory[variable]) if variable == 'complete record'
                      else np.loadtxt(directory[variable], delimiter=TAB))

RESPONDER_FILENAME = os.path.join(basepath, 'responders')
if not os.path.isfile(RESPONDER_FILENAME):
    responders = [agent for agent in xrange(data['complete record'].shape[1])
                  if np.gradient(np.array_split(data['complete record'][:, agent, PAST_MONTH_DRINKING], 3)[1]).mean() < 0]
    np.savetxt(RESPONDER_FILENAME, responders, delimiter=TAB, fmt=INTEGER)
    ap('%d Responders: %s' % (len(responders), ' '.join(map(str, responders))))
    identified_responders = set(responders) & set(data['at-risk'])
    ap('%d Responders identified as at-risk: %s' % (len(identified_responders),
                                                    ' '.join(map(str, identified_responders))))
else:
    responders = np.loadtxt(RESPONDER_FILENAME, delimiter=TAB)

overall_population = data['attitudes'].shape[0]
yes_response_yes_atrisk = len(set(responders) & set(data['at-risk']))
no_response_yes_atrisk = len(set(data['at-risk']) - set(responders))
no_response_no_atrisk = len(set(range(overall_population)) - set(responders) - set(data['at-risk']))
yes_response_no_atrisk = len(set(responders) - set(data['at-risk']))

#print contingency_table
table = Texttable()
table.set_cols_align(["r", "l", "l", "l"])
import couchdb, json, datetime
from time import time
from awesome_print import ap

timestamp = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d')
credentials = json.load(open('credentials.json'))
couch = couchdb.Server("https://yid.cloudant.com/")
couch.resource.credentials = (credentials['username'], credentials['password'])

with open('snapshot-%s' % timestamp, 'w') as outfile:
    counter = 0
    for id in couch['erowid']:
        if counter % 100 == 0:
            ap(counter)
        record = couch['erowid'][id]
        print >> outfile, record['text'].strip().encode('utf-8')
        counter += 1
from awesome_print import ap
import numpy as np

'''Calculate P (MI | {chest pain, male, 30})'''

test = {}
for key, unconditional_value in zip(["a", "b", "c", "d"], [0.2, 0.3, 0.4, 0.5]):
    test[key] = {}
    test[key]["unconditional"] = unconditional_value
    test[key]["a"] = unconditional_value + 0.1

def parse_query(dictionary, query):
    '''
    P(a | {b,c,d}) = P(a|b) P(a|c) P(a|d)                               (by assumption)
                   = P(b|a)P(a) P(c|a)P(a) P(d|a)P(a) / P(b)P(c)P(d)
                   = P(b|a) P(c|a) P(d|a) P(a)^len(conditionals) / (P(b) P(c) P(d))
    '''
    query = query.strip()
    prior, conditionals = query.split('|')
    prior = prior.strip()
    conditionals = [x.strip() for x in conditionals.split(',')]
    return np.prod([dictionary[prior]["unconditional"]
                    if prior not in dictionary[conditional]
                    else (dictionary[conditional][prior]
                          * dictionary[prior]["unconditional"]
                          / dictionary[conditional]["unconditional"])
                    for conditional in conditionals if conditional in dictionary])

ap(parse_query(test, "a | b,c,d"))
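With the toy numbers above, the query can be checked by hand; the script should print roughly 0.016:

# P(a|b) = P(b|a)P(a)/P(b) = 0.4*0.2/0.3 ~ 0.2667
# P(a|c) = 0.5*0.2/0.4 = 0.25
# P(a|d) = 0.6*0.2/0.5 = 0.24
# P(a|{b,c,d}) ~ 0.2667 * 0.25 * 0.24 ~ 0.016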
import json
from awesome_print import ap
import numpy as np

def iqr(data):
    return 0.5 * (np.percentile(data, 75) - np.percentile(data, 25))

tally = json.load(open('../data/drugs-that-actually-occurr.json', 'rb'))
ap(tally)
ap(np.median(tally.values()))
ap(9287 / float(len(tally)))
ap(iqr(tally.values()))
patho_one_ratings = np.array([i[0][0] if len(i[0]) > 0 else -1
                              for i in df_one.apply(np.nonzero, axis=1).values]).astype(int)
patho_two_ratings = np.array([i[0][0] if len(i[0]) > 0 else -1
                              for i in df_two.apply(np.nonzero, axis=1).values]).astype(int)

#Really inefficient implementation, but too many exceptions to vectorize.
#Ratings must be compared case by case (paired), not as a cross product:
contingency_table = np.zeros((3, 3))
for rating_one, rating_two in zip(patho_one_ratings, patho_two_ratings):
    if isinstance(rating_one, list):
        rating_one = rating_one[0]
    if isinstance(rating_two, list):
        rating_two = rating_two[0]
    contingency_table[rating_one, rating_two] += 1

lvsi['%s-%s' % (pathologist_one, pathologist_two)] = cohens_kappa(contingency_table).kappa
json.dump(lvsi, open('../data/lvsi-stains-grades.json', 'wb'))
ap(np.median(lvsi.values()))
print 0.5 * (np.percentile(lvsi.values(), 75) - np.percentile(lvsi.values(), 25))
def check_consistency(self):
    for source_node in self.all_nodes:
        # ap(source_node.nexts)
        # ap(source_node.previouses)
        for key, next_node in source_node.nexts.items():
            target_node = source_node.nexts[key]
            if target_node.previouses.get(key, None) != source_node:
                ap("A bug problem: Next")
                ap("{} {} {}".format(source_node.id, key, target_node.id))
                return
            if target_node not in self.all_nodes:
                ap("Suspicious New Node in way of nexts")
                ap(target_node.id)
                return
        for key, prev_node in source_node.previouses.items():
            target_node = source_node.previouses[key]
            if target_node.nexts.get(key, None) != source_node:
                ap("A bug problem: Previous")
                ap("{} {} {}".format(source_node.id, key, target_node.id))
                return None
            if target_node not in self.all_nodes:
                ap("Suspicious New Node in way of previouses")
                ap(target_node.id)
                return
import json
from sys import argv
from awesome_print import ap

filename = argv[1]
labels = json.load(open('../data/%s-matrix-kmeans-cluster-labels.json' % filename, 'rb'))["4"]["labels"]
if filename != 'taxonomy':
    drugs = open('../data/master-drug-list', 'rb').read().splitlines()
    clusters = {label: [drug for i, drug in enumerate(drugs) if labels[i] == label]
                for label in set(labels)}
else:
    effects = open('../data/taxonomy-classes', 'rb').read().splitlines()
    clusters = {label: [effect for i, effect in enumerate(effects) if labels[i] == label]
                for label in set(labels)}
json.dump(clusters, open("../data/drugs-in-each-cluster-%s.json" % filename, 'wb'))
ap(clusters)
import csv
import numpy as np
import matplotlib.pyplot as plt
from awesome_print import ap

def unique_words(aStr):
    return ' '.join([word for word in set(aStr.split())])

TEXT = 1
basis_vectors = [unique_words(line.split(':')[TEXT])
                 for line in open('lda-topics.txt', 'rb').read().splitlines()]
stopwords = set(open('stopwords.txt').read().splitlines())
data = filter(lambda x: x != 'none',
              [shift['Student Comment'].lower().strip()
               for shift in csv.DictReader(open('comments.csv'))])
data = [' '.join([word for word in set(string.split()) if word not in stopwords])
        for string in data]
ap(basis_vectors)

def jaccard_similarity(a, b):
    a = set(a)
    b = set(b)
    return len(a & b) / float(len(a | b))

def gs(X, row_vecs=True, norm=True):
    # Gram-Schmidt orthogonalization of the rows (or columns) of X
    if not row_vecs:
        X = X.T
    Y = X[0:1, :].copy()
    for i in range(1, X.shape[0]):
        proj = np.diag((X[i, :].dot(Y.T) / np.linalg.norm(Y, axis=1)**2).flat).dot(Y)
        Y = np.vstack((Y, X[i, :] - proj.sum(0)))
    if norm:
        # Assumed behavior of the truncated original: normalize rows to unit length
        Y = np.diag(1 / np.linalg.norm(Y, axis=1)).dot(Y)
    return Y if row_vecs else Y.T
import json
import numpy as np
from awesome_print import ap
from sys import argv

nclus = argv[1]
labels = json.load(
    open('../data/effect-correlation-matrix-kmeans-cluster-labels.json', 'rb')
)[str(nclus)]["labels"]  #Only care about labels for the chosen cluster count
effects = open('../data/master-class-list', 'rb').read().splitlines()
print len(effects)
print len(labels)
d = {
    cluster: [effect for i, effect in enumerate(effects) if labels[i] == int(cluster)]
    for cluster in np.unique(labels)
}
ap(d)
import numpy as np
import matplotlib.pyplot as plt
import Graphics as artist

from sys import argv
from awesome_print import ap
from matplotlib import rcParams

rcParams['text.usetex'] = True

filename = argv[1]
drugs = open('../data/master-drug-list', 'rb').read().splitlines()
m = np.loadtxt('../data/%s-matrix.tsv' % filename, delimiter='\t')

fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(m.ravel(), histtype='stepfilled', color='k', alpha=0.7)
artist.adjust_spines(ax)
ax.set_xlabel(artist.format(filename))
plt.yscale('log', nonposy='clip')
plt.tight_layout()
plt.savefig('../imgs/%s-matrix-distribution.png' % filename)

cutoff = np.percentile(m, 85)
popular_combinations = {'%s-%s' % (one, two): m[i, j]
                        for i, one in enumerate(drugs)
                        for j, two in enumerate(drugs)
                        if m[i, j] > cutoff and j < i}
ap(sorted(popular_combinations.items(), key=lambda item: item[1], reverse=True))
pca = PCA(n_components=8, whiten=True)
X = pca.fit_transform(xm)
print pca.explained_variance_ratio_

nclus = 3
clustering = AgglomerativeClustering(linkage='ward', n_clusters=nclus)
Y = clustering.fit_predict(X)
d = {label: [effect for i, effect in enumerate(effects) if clustering.labels_[i] == label]
     for label in np.unique(clustering.labels_)}
ap(d)
ct = [sum(m[:, i] > 0) for i in xrange(m.shape[0])]
ap(dict(zip(effects, ct)))
'''
colors = dict(zip(np.unique(clustering.labels_),
    brewer2mpl.get_map('Set2', 'qualitative',8).mpl_colors))
shapes = dict(zip(np.unique(clustering.labels_),['o','s','*','x','D']))
loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals
fig, axs = plt.subplots(ncols=2,sharex=True,sharey=True,figsize=(8,8))
axs[0].scatter(X[:,0],X[:,1],c=X[:,2],cmap=plt.cm.seismic,s=35, clip_on=False)
artist.adjust_spines(axs[0])
axs[0].set_xlabel(artist.format("PC 1"))
axs[0].set_ylabel(artist.format("PC 2"))
import json, itertools
import utils as tech
from awesome_print import ap

#Are there any standardized drug names not in the taxonomy?
READ = 'rb'
taxonomy = json.load(open('drug-taxonomy.json', READ))
standardized_names = json.load(open('drug_misnaming.json', READ))
if len(set(tech.flatten(standardized_names.values())) - set(taxonomy.keys())) == 0:
    ap("All standardized drug names are in the taxonomy.")
stopwords = set(open('stopwords.txt').read().splitlines())
punkt = set(string.punctuation)
upper_half = json.load(open('upper_half.json', 'rb'))
lower_half = json.load(open('lower_half.json', 'rb'))

comments = list(csv.DictReader(open('comments.csv', 'rb')))
comments = [comment for comment in comments
            if comment['Student Comment'] != 'None'
            and comment['Student Comment'] != 'NA']
upper_half_comments = [comment for comment in comments if comment['Name'] in upper_half.keys()]
lower_half_comments = [comment for comment in comments if comment['Name'] in lower_half.keys()]

ap('Upper len :%d' % len(upper_half_comments))
ap('Lower len :%d' % len(lower_half_comments))

upper_half_vocabulary = ' '.join(tech.cleanse(' '.join([comment['Student Comment']
                                                        for comment in upper_half_comments])))
lower_half_vocabulary = ' '.join(tech.cleanse(' '.join([comment['Student Comment']
                                                        for comment in lower_half_comments])))
upper_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(upper_half_vocabulary))
                    if word not in punkt and word not in stopwords]
lower_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(lower_half_vocabulary))
                    if word not in punkt and word not in stopwords]

upper_freqs = nltk.FreqDist(upper_half_words)
lower_freqs = nltk.FreqDist(lower_half_words)
print tech.weighted_jaccard_similarity(upper_freqs, lower_freqs)
with open('comments.csv', READ) as csvfile:
    comments = [row for row in csv.DictReader(csvfile)]

students = {entry['Name']: "" for entry in comments}

def process(text):
    return ' '.join([nltk.stem.WordNetLemmatizer().lemmatize(word)
                     for word in word_tokenize(text.decode('utf-8').encode('ascii', 'ignore').lower())
                     if word not in stopwords and word not in ['x', 'na']])

for student in students:
    students[student] = process(' '.join([entry['Student Comment']
                                          for entry in comments if entry['Name'] == student]))
ap(students)

keys = [student for student in students.iterkeys() if len(students[student]) > 0]
#Documents belong in fit_transform, not in the vectorizer's constructor
tfx = TfidfVectorizer(tokenizer=word_tokenize, strip_accents='unicode',
                      min_df=3, use_idf=True)
tfidf = tfx.fit_transform([students[student] for student in keys])

#--LSA
pca = PCA(n_components=3)
model = pca.fit_transform(tfidf.toarray())
ap(pca.explained_variance_ratio_)

#--lda
model = lda.LDA(n_topics=3, n_iter=1000, random_state=1)
model.fit(tfidf)
def obfuscate_author(author_input_directory, output_file_path):
    global tokenizer
    global verb_tags
    global adjective_tags
    global noun_tags
    global adverb_tags
    global english_stopwords
    global stemmer
    global is_american
    global american_to_british
    global british_to_american
    global british_american_dic
    global american_british_dic
    global vocabulary

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP"]
    adjective_tags = ["JJ", "JJR", "JJS"]
    noun_tags = ["NN", "NNS"]
    adverb_tags = ["RB", "RBR", "RBS"]
    english_stopwords = stopwords.words('english')
    stemmer = PorterStemmer()

    obfuscation_document_content = codecs.open(
        os.path.join(author_input_directory, "original.txt"),
        "r", "utf-8").read().strip()
    lines = tokenizer.tokenize(obfuscation_document_content)

    # Character offsets of each sentence within the original document
    position_tags = [(0, len(lines[0]))]
    for i in range(1, len(lines)):
        position_tags.append((position_tags[i - 1][1] + 1,
                              position_tags[i - 1][1] + len(lines[i])))
    lines = [line.replace("\r\n", " ").replace("\n", " ") for line in lines]

    obfuscated_lines = []
    vocabulary, average_document_length = avg_length_and_vocab(author_input_directory)
    is_american = assign_american_or_brit(vocabulary, 'change.csv')
    british_to_american, american_to_british = read_dictionaries('change.csv')
    for line in lines:
        ap("About to obfuscate:")
        print(line)
        obfuscated_line = obfuscate(line)
        obfuscated_lines.append(obfuscated_line)
        ap("Obfuscated line is:")
        print(obfuscated_line)
        ap("--------------------------")

    obfuscations = []
    for i in range(len(lines)):
        obfuscation = {}
        obfuscation["original"] = lines[i]
        obfuscation["original-start-charpos"] = position_tags[i][0]
        obfuscation["original-end-charpos"] = position_tags[i][1]
        obfuscation["obfuscation"] = obfuscated_lines[i]
        obfuscation["obfuscation-id"] = i + 1
        obfuscations.append(obfuscation)

    obfuscation_file = codecs.open(output_file_path, "w", "utf-8")
    obfuscation_file.write(json.dumps(obfuscations))
    obfuscation_file.close()
def process(text):
    return ' '.join([nltk.stem.WordNetLemmatizer().lemmatize(word)
                     for word in word_tokenize(text.decode('utf-8').encode('ascii', 'ignore').lower())
                     if word not in stopwords and word not in ['x', 'na']
                     and word not in set(string.punctuation)])

for student in students:
    students[student] = process(' '.join([entry['Student Comment']
                                          for entry in comments if entry['Name'] == student]))
#ap(students)

keys = [student for student in students.iterkeys() if len(students[student]) > 0]
tfx = TfidfVectorizer(tokenizer=word_tokenize, strip_accents='unicode',
                      min_df=3, use_idf=True)
tfidf = tfx.fit_transform([students[student] for student in keys])

lda = LatentDirichletAllocation(n_topics=8, max_iter=5, learning_method='online')
model = lda.fit_transform(tfidf)
'''
ap(dir(tfx))
ap(dict(zip(tfx.get_feature_names(),tfx.idf_)))
'''
#--lda
feature_names = tfx.get_feature_names()
feature_weights = tfx.idf_
n_top_words = 20
features = zip(feature_weights, feature_names)
with open('sklearn-lda-topics-w-weights', 'wb') as f:
    for topic_idx, topic in enumerate(lda.components_):
        print >> f, " : ".join(["%.4f * %s" % (features[i][0], features[i][1])
                                for i in topic.argsort()[:-n_top_words - 1:-1]])
import json
import numpy as np
from awesome_print import ap

#threshold = .030888030888
#threshold /= (161*80) #Bonferroni correction
threshold = 0.206349206349
ap(threshold)

READ = 'rb'
m = np.loadtxt('../data/drug-jaccard-similarity-actually-occurred.tsv', delimiter='\t')
drugs = open('../data/drugs-that-actually-occurr', READ).read().splitlines()
taxonomy = json.load(open('../data/drug-taxonomy.json', READ))
d = {'%s-%s' % (drugs[i], drugs[j]): m[i, j]
     for i, j in zip(*np.where(np.tril(m, k=-1) > threshold))}
ap(d)
ap(len(d))
json.dump(d, open('../data/significantly-similar-pairs', 'wb'))
#Which mixing probabilities occur more than by chance?
#Prevalence of different classes
def sqeeze(self):
    all_merges_made = set()

    #--- Pass 1: squeeze duplicate successor nodes reached by the same word
    merges_made = 100
    while merges_made > 0:
        merges_made = 0
        for node_x in self.all_nodes:
            word_keys = [ParseForest.id_to_word_dictionary[int(key)]
                         for key in node_x.nexts.keys()]
            for word_key, word_key_count in Counter(word_keys).items():
                if word_key_count > 1:
                    keys = [key for key in node_x.nexts.keys()
                            if ParseForest.id_to_word_dictionary[int(key)] == word_key]
                    nodes = [node_x.nexts[key] for key in keys]
                    for node_a, node_b in itertools.combinations(nodes, 2):
                        if node_a.id == node_b.id:
                            continue
                        pair = "-".join([str(node_a.id), str(node_b.id)])
                        if pair in all_merges_made:
                            continue
                        if not self.is_remotely_connected(node_a, node_b):
                            if node_b in self.all_nodes:
                                self.merge_fsm_nodes(node_a, node_b)
                                all_merges_made.add(pair)
                                all_merges_made.add("-".join([str(node_b.id), str(node_a.id)]))
                                merges_made += 1
                        elif self.is_connected(node_a, node_b):
                            if node_b in node_a.nexts.values():
                                # Replace the redundant edge with an epsilon edge a -> b
                                word_id = ParseForest.next_unique_word_id("*e*")
                                node_a.nexts[word_id] = node_b
                                node_b.previouses[word_id] = node_a
                                redundant_edge = [key for key in keys
                                                  if node_x.nexts.get(key, None) == node_b]
                                if len(redundant_edge) > 0:
                                    redundant_edge = redundant_edge[0]
                                    target_node = node_x.nexts.get(redundant_edge, None)
                                    if target_node:
                                        del node_x.nexts[redundant_edge]
                                        del target_node.previouses[redundant_edge]
                            else:
                                # Epsilon edge in the other direction, b -> a
                                word_id = ParseForest.next_unique_word_id("*e*")
                                node_b.nexts[word_id] = node_a
                                node_a.previouses[word_id] = node_b
                                redundant_edge = [key for key in keys
                                                  if node_x.nexts.get(key, None) == node_a]
                                if len(redundant_edge) > 0:
                                    redundant_edge = redundant_edge[0]
                                    target_node = node_x.nexts.get(redundant_edge, None)
                                    if target_node:
                                        del node_x.nexts[redundant_edge]
                                        del target_node.previouses[redundant_edge]
                            all_merges_made.add(pair)

    #--- Pass 2: the same squeeze over predecessor edges
    merges_made = 100
    while merges_made > 0:
        merges_made = 0
        for node_x in self.all_nodes:
            word_keys = [ParseForest.id_to_word_dictionary[int(key)]
                         for key in node_x.previouses.keys()]
            for word_key, word_key_count in Counter(word_keys).items():
                if word_key_count > 1:
                    keys = [key for key in node_x.previouses.keys()
                            if ParseForest.id_to_word_dictionary[int(key)] == word_key]
                    nodes = [node_x.previouses[key] for key in keys]
                    for node_a, node_b in itertools.combinations(nodes, 2):
                        if node_a.id == node_b.id:
                            continue
                        pair = "-".join([str(node_a.id), str(node_b.id)])
                        if pair in all_merges_made:
                            continue
                        if not self.is_remotely_connected(node_a, node_b):
                            if node_b in self.all_nodes:
                                self.merge_fsm_nodes(node_a, node_b)
                                all_merges_made.add(pair)
                                all_merges_made.add("-".join([str(node_b.id), str(node_a.id)]))
                                merges_made += 1
                        elif self.is_connected(node_a, node_b):
                            if node_b in node_a.nexts.values():
                                word_id = ParseForest.next_unique_word_id("*e*")
                                node_a.nexts[word_id] = node_b
                                node_b.previouses[word_id] = node_a
                                redundant_edge = [key for key in keys
                                                  if node_x.previouses.get(key, None) == node_a]
                                if len(redundant_edge) > 0:
                                    redundant_edge = redundant_edge[0]
                                    target_node = node_x.previouses.get(redundant_edge, None)
                                    if target_node:
                                        del node_x.previouses[redundant_edge]
                                        del target_node.nexts[redundant_edge]
                            else:
                                word_id = ParseForest.next_unique_word_id("*e*")
                                node_b.nexts[word_id] = node_a
                                node_a.previouses[word_id] = node_b
                                redundant_edge = [key for key in keys
                                                  if node_x.previouses.get(key, None) == node_b]
                                if len(redundant_edge) > 0:
                                    redundant_edge = redundant_edge[0]
                                    target_node = node_x.previouses.get(redundant_edge, None)
                                    if target_node:
                                        del node_x.previouses[redundant_edge]
                                        del target_node.nexts[redundant_edge]
                            all_merges_made.add(pair)

    ap(all_merges_made)
for rating_one, rating_two in zip(patho_one_ratings, patho_two_ratings):
    #print rating_one, rating_two
    if isinstance(rating_one, list):
        rating_one = rating_one[0]
    if isinstance(rating_two, list):
        rating_two = rating_two[0]
    #print i
    contingency_table[j, rating_one, rating_two] += 1

kappas['%s-%s' % (pathologist_one, pathologist_two)] = cohens_kappa(contingency_table[j, :, :].squeeze()).kappa

print np.median(contingency_table, axis=0)
print 0.5 * (np.percentile(contingency_table, 75, axis=0) - np.percentile(contingency_table, 25, axis=0))
json.dump(kappas, open('kappa-by-grade-no-ihc.json', 'wb'))
ap(np.median(kappas.values()))
print 0.5 * (np.percentile(kappas.values(), 75) - np.percentile(kappas.values(), 25))
'''
df_one = pd.read_excel('stains.xls', pathologist, parse_cols=cols_with_grades, convert_float=False)
df_two = pd.read_excel('no-stain.xls', pathologist, parse_cols=cols_with_grades, convert_float=False)
patho_one_ratings = np.array([i[0][0] if len(i[0]) > 0 else -1 for i in df_one.apply(np.nonzero,axis=1).values]).astype(int)
patho_two_ratings = np.array([i[0][0] if len(i[0]) > 0 else -1 for i in df_two.apply(np.nonzero,axis=1).values]).astype(int)
#Really inefficient implementation, but too many exceptions to vectorize:
contingency_table = np.zeros((3,3))
for rating_one in patho_one_ratings:
    if type(rating_one) == type(list):
#! /usr/bin/env python
import requests
from awesome_print import ap

if __name__ == '__main__':
    resp = requests.get('https://api.github.com/users/macro1/repos')
    ap(resp.json())
labels, freqs = zip(*sorted(effect_tally.items(), key=lambda item: item[1], reverse=True)[:cutoff])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(freqs, 'k--', linewidth=2)
artist.adjust_spines(ax)
ax.set_xticks(xrange(len(labels)))
ax.set_xticklabels(map(artist.format, labels), rotation='vertical')
ax.set_ylabel(artist.format('No. of mentions'))
plt.tight_layout()
plt.savefig('../imgs/effect-frequency.png')
del fig, ax

#How often does each class occur in the database?
class_tally = dict(Counter(list(tech.flatten([taxonomy[drug]["class"] for drug in tally.keys()]))))
ap(class_tally)
'''
for drug in taxonomy:
    if 'class' not in taxonomy[drug]:
        ap(drug)
'''
labels, freqs = zip(*sorted(class_tally.items(), key=lambda item: item[1], reverse=True)[:cutoff])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(freqs, 'k--', linewidth=2)