Example #1
  def publish(self):
    if 'publish' not in self.rules:
      return

    publish_rules = self.rules['publish']

    if 'repository' not in publish_rules or 'name' not in publish_rules:
      ap("Nowhere to publish.")
      return 1

    build_tag  = self.build_tag()
    repository = publish_rules['repository']
    name       = publish_rules['name']

    ap("Tagging")
    tag_cmd = 'docker tag %(build_tag)s %(repository)s/%(name)s' % locals()
    ap(tag_cmd)
    tag_result = call(tag_cmd.split())
    if tag_result > 0:
      return tag_result

    ap("Publishing")
    publish_cmd = 'docker push %(repository)s/%(name)s' % locals()
    ap(publish_cmd)
    return call(publish_cmd.split())
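These Docker helpers (and the later build/run examples) call ap and call without showing their imports. A minimal sketch of the assumed setup, with ap coming from awesome_print (as the standalone scripts below do) and call from the standard library:

from subprocess import call   # runs a command and returns its exit code
from awesome_print import ap  # pretty-printer used for status messages

Note that publish() propagates nonzero exit codes from docker tag and docker push, so callers can detect failures.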
Example #2
def plot_variable(data,basepath=None,dataname='',criterion=None, criterionname=[]):
	fig = plt.figure()
	ax = fig.add_subplot(111)
	x = range(data.shape[1])
	ap('Plotting %s'%dataname)
	if criterion is not None:
		if not isinstance(criterion, list):
			median, lq, uq = perc(data[criterion,:])
			ax.plot(x,median,linewidth=2, color='#B22400')
			ax.fill_between(x, lq, uq, alpha=0.25, linewidth=0, color='#B22400')
		else:
			bmap = brewer2mpl.get_map('Set2', 'qualitative', 7)
			colors = bmap.mpl_colors
			for i,(x_criterion,x_label) in enumerate(itertools.izip_longest(criterion,criterionname,fillvalue='Group')):
				median, lq, uq = perc(data[x_criterion,:])
				ax.plot(x,median,linewidth=2, color=colors[i], label=artist.format(x_label))
				ax.fill_between(x, lq, uq, alpha=0.25, linewidth=0, color=colors[i])
	
	median, lq, uq = perc(data)
	ax.plot(x,median,linewidth=2, color='#B22400',label=artist.format('Full population'))
	ax.fill_between(x, lq, uq, alpha=0.25, linewidth=0, color='#B22400')
	
	artist.adjust_spines(ax)
	ax.set_ylabel(artist.format(dataname))
	ax.set_xlabel(artist.format('Time'))
	ax.axvline(data.shape[1]/3,color='r',linewidth=2,linestyle='--')
	ax.axvline(2*data.shape[1]/3,color='r',linewidth=2,linestyle='--')
	plt.legend(frameon=False,loc='lower left')
	plt.tight_layout()
	plt.savefig(os.path.join(basepath,'%s.png'%dataname))
def main(argv):
    input_corpus_directory = ''
    output_corpus_directory = ''
    try:
        print(argv)
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print 'Usage is: python obfuscation_software.py -i <input_corpus_directory> -o <output_corpus_directory>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage is: python obfuscation_software.py -i <input_corpus_directory> -o <output_corpus_directory>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            input_corpus_directory = arg
        elif opt in ("-o", "--ofile"):
            output_corpus_directory = arg

    problem_directories = glob.glob(os.path.join(input_corpus_directory, "*"))

    if (os.path.exists(output_corpus_directory)):
        shutil.rmtree(output_corpus_directory)
    os.mkdir(output_corpus_directory)

    for problem_directory in problem_directories:
        problem_name = os.path.basename(problem_directory)
        ap("In problem:  {0}".format(problem_name))
        os.mkdir(os.path.join(output_corpus_directory, problem_name))
        output_file_path = os.path.join(output_corpus_directory, problem_name,
                                        "obfuscated.json")
        obfuscate_author(problem_directory, output_file_path)
        ap("Completed problem:  {0}".format(problem_name))
Example #4
  def build_dockerfile(self):
    # copy private key into build
    self.prepare_key()

    build_tag = self.build_tag()
    build_cmd = 'docker build -t %(build_tag)s .' % locals()
    ap(build_cmd)
    return call(build_cmd.split())
Example #5
 def print_fsm(self):
     start = self.start
     ap("Node Id: {}".format(start.id))
     for key, next_node in start.nexts.items():
         print("key: {}  => Node Id: {}".format(key, next_node.id))
     print("\n")
     for key, next_node in start.nexts.items():
         ap("From Node {} taking key {}".format(start.id, key))
         self.print_sub_fsm(next_node)
	def print_fsm( self ):
		start = self.start
		ap("Node Id: {}".format( start.id ))
		for key, next_node in start.nexts.items():
			print("key: {}  => Node Id: {}".format(key, next_node.id ))
		print("\n")
		for key, next_node in start.nexts.items():
			ap( "From Node {} taking key {}".format(start.id, key))
			self.print_sub_fsm(next_node)
Example #7
  def unload_environment(self):
    for image, service_config in self.services().iteritems():
      container_name = self.container_name_for_service(image, service_config)

      stop_cmd = 'docker stop %(container_name)s' % locals()
      ap(stop_cmd)
      call(stop_cmd.split())

      rm_cmd = 'docker rm %(container_name)s' % locals()
      ap(rm_cmd)
      call(rm_cmd.split())
Example #8
  def load_environment(self):
    for image, service_config in self.services().iteritems():
      container_name = self.container_name_for_service(image, service_config)

      envs = ''
      if service_config and 'env' in service_config:
        envs = self.envs(service_config['env'])

      run_cmd = 'docker run -d %(envs)s --name %(container_name)s %(image)s' % locals()
      ap(run_cmd)
      result = call(run_cmd.split())
      if result > 0:
        return result

    return 0
	def print_sub_fsm(self, start):
		ap("Node Id: {}".format( start.id ))
		if self.end == start:
			ap("===== Reached End ======")

		for key, next_node in start.nexts.items():
			print("key: {}  => Node Id: {}".format(key, next_node.id ))
		print("\n")
		for key, next_node in start.nexts.items():
			ap( "From Node {} taking key {}".format(start.id, key))
			self.print_sub_fsm(next_node)
Example #10
    def print_sub_fsm(self, start):
        ap("Node Id: {}".format(start.id))
        if self.end == start:
            ap("===== Reached End ======")

        for key, next_node in start.nexts.items():
            print("key: {}  => Node Id: {}".format(key, next_node.id))
        print("\n")
        for key, next_node in start.nexts.items():
            ap("From Node {} taking key {}".format(start.id, key))
            self.print_sub_fsm(next_node)
Example #11
def process(effect, entry):
    bar.next()

    ap('Looking for whether %s in' % effect)
    # Use x as the comprehension variable so it does not shadow (and, in
    # Python 2, rebind) the EFFECT argument passed to the function.
    ap([
        x for drug in entry["drugs"]
        for x in taxonomy[drug]["effects"]
    ])
    ap(1 if effect in [
        x for drug in entry["drugs"]
        for x in taxonomy[drug]["effects"]
    ] else -1)
    return 1 if effect in [
        x for drug in entry["drugs"]
        for x in taxonomy[drug]["effects"]
    ] else -1
Example #12
  def run_build_steps(self):
    build_tag = self.build_tag()

    for step, cmds in self.build_steps().items():
      ap(step)
      all_cmds = ' && '.join(cmds)
      all_cmds = '/bin/bash -l -c "%(all_cmds)s"' % locals()
      links = self.service_links()

      build_run_cmd = 'docker run -i %(links)s %(build_tag)s %(all_cmds)s' % locals()
      ap(build_run_cmd)

      result = call(build_run_cmd, shell=True)
      ap(result)

      if result != 0:
        return result

    return 0
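To make the string composition concrete: for a step whose cmds were, say, ['bundle install', 'rake test'], a build tag of my-app:build, and no service links (all hypothetical values), the command handed to call(..., shell=True) would look like:

docker run -i  my-app:build /bin/bash -l -c "bundle install && rake test"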
import json

from awesome_print import ap 
drugs = open('master-drug-list').read().splitlines()
labels = json.load(open('correlation-matrix-kmeans-cluster-labels.json','rb'))["2"]["labels"]


clusters = {label:[drug for i,drug in enumerate(drugs)
			if labels[i]==label] for label in set(labels)}

json.dump(clusters, open("drugs-in-each-cluster.json",'wb'))

ap(clusters)
test = {}
for key, unconditional_value in zip(["a", "b", "c", "d"],
                                    [0.2, 0.3, 0.4, 0.5]):
    test[key] = {}
    test[key]["unconditional"] = unconditional_value
    test[key]["a"] = unconditional_value + 0.1


def parse_query(dictionary, query):
    '''  P (a | {b,c,d})
		 = P(a|b) P(a|c) P(a|d) (by assumption)
		 = P(b|a)P(a)P(c|a)P(a)P(d|a)P(a) / P(b)P(c)P(d)
		 = P(b|a) P(c|a) P(d|a) P(a)^(len(conditionals)) / P(b) P(c) P(d)
	'''
    query = query.strip()
    prior, conditionals = query.split('|')

    prior = prior.strip()
    conditionals = [x.strip() for x in conditionals.split(',')]

    return np.prod([
        dictionary[prior]["unconditional"]
        if prior not in dictionary[conditional] else
        (dictionary[conditional][prior] * dictionary[prior]["unconditional"] /
         dictionary[conditional]["unconditional"])
        for conditional in conditionals if conditional in dictionary
    ])


ap(parse_query(test, "a | b,c,d"))
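For the toy test dictionary above, every conditional contributes one Bayes-rule term of the form P(b|a)*P(a)/P(b), so the product can be checked by hand (a quick arithmetic check with the same toy values, not output from the original script):

# parse_query(test, "a | b,c,d") multiplies three terms:
#   b: test['b']['a'] * test['a']['unconditional'] / test['b']['unconditional'] = 0.4 * 0.2 / 0.3 ~ 0.2667
#   c: 0.5 * 0.2 / 0.4 = 0.25
#   d: 0.6 * 0.2 / 0.5 = 0.24
# product ~ 0.2667 * 0.25 * 0.24 ~ 0.016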
Example #15
import numpy as np

from collections import Counter
from awesome_print import ap
TAB = '\t'
#-- Load effect matrix (rows=effect, cols=document)
effect_matrix = np.loadtxt('../data/effect-matrix.tsv', delimiter=TAB)
effects = open('../data/master-class-list').read().splitlines()
#--- Prevalence of effects

effect_prevalence = {
    effect: sum(effect_matrix[i, :] == 1)
    for i, effect in enumerate(effects)
}

ap(sorted(effect_prevalence.items(), key=lambda item: item[1], reverse=True))
cutoff = 30

palette = sns.color_palette("Set2",cutoff)
colors = {effect:palette[i] for i,effect in 
						enumerate(sorted(dict(unique_drug_classes.iteritems()),
								key=lambda item:item[1],reverse=True)[:cutoff])}

L = hierarchy.linkage(m, method='average')
Z = hierarchy.dendrogram(L,no_plot=True)

cluster_idx = hierarchy.fcluster(L,1.4,depth=4)

drug_clusters = {str(i):[drugs[j] for j,pos in enumerate(cluster_idx) if pos==i] 
				for i in set(cluster_idx)}

ap(drug_clusters)
json.dump(drug_clusters,open('../data/drug-clusters.json','wb'))
row_colors = [colors[taxonomy[drugs[idx]]["effects"][0]] for idx in Z["leaves"]]

cg = sns.clustermap(m,row_linkage=L,col_linkage=L,cmap=plt.cm.Reds,yticklabels=False,
	figsize=(8,8),xticklabels=False)

cg.savefig('../imgs/drug-drug-correlation-w-cluster.png')

'''
ind = hierarchy.fcluster(L, 0.5*d.max(), 'distance')

fig = plt.figure()

ax2 = fig.add_axes([0.09,0.1,0.5,0.6],frame_on=False)
Z1 = hierarchy.dendrogram(L, orientation='left')
Example #17
#!/usr/bin/env python
from awesome_print import ap

objects = [
        None,
        True,
        "Hello, World!",
        65535,
        3.1415926,
        [1,2,3,4,5],
        {'one': 1, 'two': 2, 'ten': 10},
        {
            'one': 1,
            'two': ['uno', 'dos', 'tres'],
            'six': {
                'ein': 1,
                'zwei': 2,
                'drei': 3
            }
        }
        ]

print '>> from awesome_print import ap'
for object in objects:
    if type(object) is str:
        print '>> ap("' + str(object) +'")'
    else:
        print '>> ap(' + str(object) +')'
    ap(object)
Example #18
from collections import defaultdict
from awesome_print import ap

# ap("1234567\n890")

# ap([1,2,3,4,5,6,7,8,9,0,1])
# ap({'a': 1, 'ab\na': 2})

ap([None, {"aasdfasdf\nbdd": 1, (1,2,3):2, 'b':4, 'q':[1,2,[1,2,3]]}, 2, u"A unicode\nstring\nmore stuff!", [1,2,3,4,5,6,6,7,8,9,9,{'a': u"An em dash: \u2014\nMore characters! \u2192", 'b': -1.445}]], options={'indent': 2})
Example #19
#create effect matrix

def iqr(data):
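	# Note: this returns half the interquartile range (the semi-IQR), not the full IQR.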
	return 0.5*(np.percentile(data,75)-np.percentile(data,25))

db = json.load(open('../data/db.json','rb'))
effects = open('../data/master-effect-list','rb').read().splitlines()
taxonomy = json.load(open('../data/drug-taxonomy.json','rb'))
def process(effect,entry):
	effects = list(set([x for drug in entry["drugs"]
						  for x in taxonomy[drug]["effects"]]))

	#Calling x so as not to shadow the argument EFFECT passed to function
	return 1 if effect in effects else -1

ap('Calculating occurrence matrix')
m = np.array([[process(effect,entry)
					for entry in db.values()]
					for effect in effects],dtype=int)

np.savetxt('../data/effect-matrix.tsv', m, fmt='%d',delimiter='\t')

tally = {effect:sum(m[i,:]==1) for i,effect in enumerate(effects)}

json.dump(tally,open('../data/frequency-of-effects.json','wb'))
json.dump({'expected occurrence':len(db)/float(len(effects)),
		'iqr':iqr(tally.values())},open('../data/thresholds.json','wb'))

#--- Exclude infrequently occurring effects (keep only those mentioned more than 200 times)

idx = [i for i,effect in enumerate(effects) if tally[effect]>200]
influence_kernel = {node:normalize(np.array([influence[predecessor] 
						for predecessor in G.predecessors(node)]).astype(float))
 						for node in G.nodes_iter()}

agent = 0
reps = 1

record = np.zeros((reps,stop))
for rep in range(reps):
	for t in xrange(start,stop):

		EFFECT_SIZE = 0 if t < 10 else -gain*n

		internal_influence = actors[0].calculate_intent_to_drink()
		actors[0].update({'attitude to medical consequences':EFFECT_SIZE})
		ap(actors[0].snapshot(as_dict=True,print_calc=True))
		ap(actors[0].inspect_calculation())
		effect[t] = actors[0].variables['attitude to medical consequences']
		intent[t] = actors[0].variables['intent to drink']

		attitudes[agent,t] = attitudes[agent,t-1] + epsilon*((effect-attitudes[agent,t-1]) if (effect-attitudes[agent,t-1]) > THRESHOLDS[agent] else 0)
		social_influence = attitudes[G.predecessors(agent),t-1].dot(influence_kernel[agent]) #kernel already normalized
		'''
		effect = (1-alpha[agent])*internal_influence + alpha[agent]*social_influence
		attitudes[agent,t] += (epsilon*(effect if effect > THRESHOLDS[agent] else 0))

		#update agent's drinking behavior
		drinking_behavior[agent,t] = G.node[agent]['actor'].variables['past month drinking']
		local_medical_attitudes = np.array([G.node[influencer]['actor'].variables['attitude to medical consequences'] for 
													influencer in G.predecessors(agent)]).dot(influence_kernel[agent])
Example #21
import json

import numpy as np
import utils as tech

from awesome_print import ap
from collections import Counter

db = json.load(open('../data/db.json', 'rb'))
drugs = open('../data/list_of_drugs', 'rb').read().splitlines()

#----- Descriptive Statistics
frac = len([x for x in db if len(db[x]["drugs"]) > 0]) / float(len(db))
ap("Fraction of db with recognized drugs: %.04f" % frac)

tally = dict(Counter(tech.flatten([db[entry]["drugs"] for entry in db])))
drugs_that_occur = {
    drug: tally[drug] if drug in tally else 0
    for drug in drugs
}

dto = drugs_that_occur.values()
with open('../data/drugs-that-actually-occurr', 'wb') as fid:
    for drug in tally:
        print >> fid, drug

json.dump(tally, open('../data/drugs-that-actually-occurr.json', 'wb'))

#--- What entries have no drugs?
'''
for entry in db:
Example #22
from types import BooleanType
from awesome_print import ap

objects = [
        ["abc\ndef", {(1,2) : (1,2,3,4,5,6,7,8,9,10,11)}],
        None,
        BooleanType,
        True,
        "Hello, World!",
        65535,
        3.1415926,
        (1,2,3,4,5,6,7,8,9,10,11),
        [1,2,3,4,5],
        {'one': 1, 'two': 2, 'ten': 10},
        {
            'one': 1,
            'two': ['uno', 'dos', 'tres'],
            'six': {
                'ein': 1,
                'zwei': 2,
                'drei': 3
            }
        }
        ]

print '>> from awesome_print import ap'
for object in objects:
    if type(object) is str:
        print '>> ap("' + str(object) +'")'
    else:
        print '>> ap(' + str(object) +')'
    ap(object)
#-----INITIALIZE------------------------------------------
data = {}
directory = json.load(open(os.path.join(basepath,'directory.json'),READ))
for variable in directory:
	data[variable] = np.load(directory[variable]) if variable == 'complete record' else np.loadtxt(directory[variable],delimiter = TAB)


RESPONDER_FILENAME = os.path.join(basepath,'responders')
if not os.path.isfile(RESPONDER_FILENAME):

	responders = [agent for agent in xrange(data['complete record'].shape[1])
					if np.gradient(np.array_split(data['complete record'][:,agent,PAST_MONTH_DRINKING],3)[1]).mean()<0]

	np.savetxt(RESPONDER_FILENAME,responders,delimiter=TAB,fmt=INTEGER) 
	ap('%d Responders: %s'%(len(responders),' '.join(map(str,responders))))
	identified_responders = set(responders) & set(data['at-risk'])
	ap('%d Responders identified as at-risk: %s'%(len(identified_responders),map(str,identified_responders)))

else:
	responders = np.loadtxt(RESPONDER_FILENAME,delimiter=TAB)

overall_population = data['attitudes'].shape[0]
yes_response_yes_atrisk = len(set(responders) & set(data['at-risk']))
no_response_yes_atrisk = len(set(data['at-risk']) - set(responders))
no_response_no_atrisk =  len(set(range(overall_population)) - set(responders)-set(data['at-risk']))
yes_response_no_atrisk =  len(set(responders)-set(data['at-risk'])) 

#print contingency_table
table = Texttable()
table.set_cols_align(["r", "l","l","l"])
import couchdb, json, datetime

from time import time 
from awesome_print import ap 

timestamp = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d')
credentials = json.load(open('credentials.json'))

couch = couchdb.Server("https://yid.cloudant.com/")
couch.resource.credentials = (credentials['username'],credentials['password'])

with open('snapshot-%s'%timestamp,'w') as outfile:
	counter = 0 
	for id in couch['erowid']:
		if counter%100==0:
			ap(counter)
		record = couch['erowid'][id]
		print>>outfile,record['text'].strip().encode('utf-8')
		counter += 1 
		
from awesome_print import ap 
import numpy as np 
'''Calculate P (MI | {chest pain, male, 30})'''

test = {}
for key, unconditional_value in zip(["a","b","c","d"],[0.2,0.3,0.4,0.5]):
	test[key] = {}
	test[key]["unconditional"] = unconditional_value
	test[key]["a"] = unconditional_value + 0.1

def parse_query(dictionary,query):
	'''  P (a | {b,c,d})
		 = P(a|b) P(a|c) P(a|d) (by assumption)
		 = P(b|a)P(a)P(c|a)P(a)P(d|a)P(a) / P(b)P(c)P(d)
		 = P(b|a) P(c|a) P(d|a) P(a)^(len(conditionals)) / P(b) P(c) P(d)
	'''
	query = query.strip()
	prior, conditionals = query.split('|')

	prior = prior.strip()
	conditionals = [x.strip() for x in conditionals.split(',')]

	return np.prod([dictionary[prior]["unconditional"] 
			if prior not in dictionary[conditional]
			else (dictionary[conditional][prior]* dictionary[prior]["unconditional"]/
				dictionary[conditional]["unconditional"])
			for conditional in conditionals if conditional in dictionary])

ap(parse_query(test,"a | b,c,d"))
Example #26
import json

from awesome_print import ap 
import numpy as np 


def iqr(data):
	return 0.5* (np.percentile(data,75)-np.percentile(data,25))
tally= json.load(open('../data/drugs-that-actually-occurr.json','rb'))


ap(tally)
ap(np.median(tally.values()))
ap(9287/float(len(tally)))
ap(iqr(tally.values()))
Example #27
    patho_one_ratings = np.array([
        i[0][0] if len(i[0]) > 0 else -1
        for i in df_one.apply(np.nonzero, axis=1).values
    ]).astype(int)
    patho_two_ratings = np.array([
        i[0][0] if len(i[0]) > 0 else -1
        for i in df_two.apply(np.nonzero, axis=1).values
    ]).astype(int)

    #Really inefficient implementation, but too many exceptions to vectorize:

    contingency_table = np.zeros((3, 3))

    for rating_one in patho_one_ratings:
        if isinstance(rating_one, list):
            rating_one = rating_one[0]
        for rating_two in patho_two_ratings:
            if isinstance(rating_two, list):
                rating_two = rating_two[0]
            print '\t %d' % rating_two
            contingency_table[rating_one, rating_two] += 1

    lvsi['%s-%s' % (pathologist_one,
                    pathologist_two)] = cohens_kappa(contingency_table).kappa

json.dump(lvsi, open('../data/lvsi-stains-grades.json', 'wb'))

ap(np.median(lvsi.values()))
print 0.5 * (np.percentile(lvsi.values(), 75) -
             np.percentile(lvsi.values(), 25))
Example #28
import numpy as np 

from collections import Counter
from awesome_print import ap 
TAB = '\t'
#-- Load effect matrix (rows=effect,cols=document)
effect_matrix = np.loadtxt('../data/effect-matrix.tsv',delimiter=TAB)
effects = open('../data/master-class-list').read().splitlines()
#--- Prevalence of effects

effect_prevalence = {effect:sum(effect_matrix[i,:]==1)
							for i,effect in enumerate(effects)}

ap(sorted(effect_prevalence.items(),key=lambda item:item[1],reverse=True))
Example #29
	def check_consistency(self):
		self.all_nodes
		for source_node in self.all_nodes:
			# ap(source_node.nexts)
			# ap(source_node.previouses)
			for key, next_node in source_node.nexts.items():
				target_node = source_node.nexts[key]
				if target_node.previouses.get(key, None) != source_node:
					ap( "A bug problem: Next")
					ap( "{} {}".format(source_node.id, key, target_node.id))
					return 
				if target_node not in self.all_nodes:
					ap( "Suspicious New Node in way of nexts")
					ap( target_node.id)
					return
			for key, prev_node in source_node.previouses.items():
				target_node = source_node.previouses[key]
				if target_node.nexts.get(key, None) != source_node:
					ap( "A bug problem: Previous")
					ap( "{} {}".format(source_node.id, key, target_node.id))
					return None
				if target_node not in self.all_nodes:
					ap( "Suspicious New Node in way previouses")
					ap( target_node.id)
					return
Example #30
import json

from sys import argv
from awesome_print import ap 

filename = argv[1]


labels = json.load(open('../data/%s-matrix-kmeans-cluster-labels.json'%filename,'rb'))["4"]["labels"]

if filename != 'taxonomy':
	drugs = open('../data/master-drug-list','rb').read().splitlines()
	clusters = {label:[drug for i,drug in enumerate(drugs)
			if labels[i]==label] for label in set(labels)}

else: 
	effects = open('../data/taxonomy-classes','rb').read().splitlines()
	clusters={label:[effect for i,effect in enumerate(effects) 
						if labels[i]==label] for label in set(labels)}

json.dump(clusters, open("../data/drugs-in-each-cluster-%s.json"%filename,'wb'))
ap(clusters)
import numpy as np
import matplotlib.pyplot as plt

from awesome_print import ap

def unique_words(aStr):
	return ' '.join([word for word in set(aStr.split())])

TEXT = 1
basis_vectors = [unique_words(line.split(':')[TEXT]) for line in open('lda-topics.txt','rb').read().splitlines()]
stopwords = set(open('stopwords.txt').read().splitlines())
data = filter(lambda x: x != 'none',[shift['Student Comment'].lower().strip() for shift in csv.DictReader(open('comments.csv'))])
data = [' '.join([word for word in set(string.split()) if word not in stopwords]) for string in data]

ap(basis_vectors)

def jaccard_similarity(a,b):
	a = set(a)
	b = set(b)

	return len(a & b)/float(len(a | b))
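A quick usage check for the set-based similarity above (illustrative values, not from the original script); because both arguments are converted to sets, repeated words do not change the score:

ap(jaccard_similarity(['foo', 'bar'], ['bar', 'baz']))  # 1 shared word / 3 distinct words = 1/3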

def gs(X, row_vecs=True, norm = True):
	if not row_vecs:
		X = X.T
	Y = X[0:1,:].copy()
	for i in range(1, X.shape[0]):
		proj = np.diag((X[i,:].dot(Y.T)/np.linalg.norm(Y,axis=1)**2).flat).dot(Y)
		Y = np.vstack((Y, X[i,:] - proj.sum(0)))
	if norm:
Example #32
import json

import numpy as np

from awesome_print import ap
from sys import argv

nclus = argv[1]
labels = json.load(
    open('../data/effect-correlation-matrix-kmeans-cluster-labels.json',
         'rb'))[str(nclus)]["labels"]
#Only care about labels for two cluster case

effects = open('../data/master-class-list', 'rb').read().splitlines()
print len(effects)
print len(labels)
d = {
    cluster:
    [effect for i, effect in enumerate(effects) if labels[i] == int(cluster)]
    for cluster in np.unique(labels)
}
ap(d)
Example #33
import numpy as np
import matplotlib.pyplot as plt
import Graphics as artist

from sys import argv
from awesome_print import ap 
from matplotlib import rcParams

rcParams['text.usetex'] = True
filename = argv[1]

drugs = open('../data/master-drug-list','rb').read().splitlines()
m = np.loadtxt('../data/%s-matrix.tsv'%filename,delimiter='\t')

fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(m.ravel(),histtype='stepfilled',color='k',alpha=0.7)
artist.adjust_spines(ax)
ax.set_xlabel(artist.format(filename))
plt.yscale('log', nonposy='clip')
plt.tight_layout()
plt.savefig('../imgs/%s-matrix-distribution.png'%filename)

cutoff = np.percentile(m,85)
popular_combinations = {'%s-%s'%(one,two): m[i,j]
							for i,one in enumerate(drugs)
							for j,two in enumerate(drugs) 
							if m[i,j] > cutoff and j<i}

ap(sorted(popular_combinations.items(),key=lambda item:item[1],reverse=True))


pca = PCA(n_components=8,whiten=True)
X = pca.fit_transform(xm)
print pca.explained_variance_ratio_
nclus = 3

clustering = AgglomerativeClustering(linkage='ward', n_clusters=nclus)
Y = clustering.fit_predict(X)

d = {label:[effect for i,effect in enumerate(effects)
                    if clustering.labels_[i] == label] 
                for label in np.unique(clustering.labels_)}

ap(d)
ct = [sum(m[:,i]>0) for i in xrange(m.shape[0])]
ap(dict(zip(effects,ct)))
'''
colors = dict(zip(np.unique(clustering.labels_),
            brewer2mpl.get_map('Set2', 'qualitative',8).mpl_colors))

shapes = dict(zip(np.unique(clustering.labels_),['o','s','*','x','D']))
loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals

fig, axs = plt.subplots(ncols=2,sharex=True,sharey=True,figsize=(8,8))

axs[0].scatter(X[:,0],X[:,1],c=X[:,2],cmap=plt.cm.seismic,s=35, clip_on=False)
artist.adjust_spines(axs[0])
axs[0].set_xlabel(artist.format("PC 1"))
axs[0].set_ylabel(artist.format("PC 2"))
import couchdb, json, datetime

from time import time
from awesome_print import ap

timestamp = datetime.datetime.fromtimestamp(time()).strftime('%Y-%m-%d')
credentials = json.load(open('credentials.json'))

couch = couchdb.Server("https://yid.cloudant.com/")
couch.resource.credentials = (credentials['username'], credentials['password'])

with open('snapshot-%s' % timestamp, 'w') as outfile:
    counter = 0
    for id in couch['erowid']:
        if counter % 100 == 0:
            ap(counter)
        record = couch['erowid'][id]
        print >> outfile, record['text'].strip().encode('utf-8')
        counter += 1
Example #36
X = pca.fit_transform(xm)
print pca.explained_variance_ratio_
nclus = 3

clustering = AgglomerativeClustering(linkage='ward', n_clusters=nclus)
Y = clustering.fit_predict(X)

d = {
    label: [
        effect for i, effect in enumerate(effects)
        if clustering.labels_[i] == label
    ]
    for label in np.unique(clustering.labels_)
}

ap(d)
ct = [sum(m[:, i] > 0) for i in xrange(m.shape[0])]
ap(dict(zip(effects, ct)))
'''
colors = dict(zip(np.unique(clustering.labels_),
            brewer2mpl.get_map('Set2', 'qualitative',8).mpl_colors))

shapes = dict(zip(np.unique(clustering.labels_),['o','s','*','x','D']))
loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals

fig, axs = plt.subplots(ncols=2,sharex=True,sharey=True,figsize=(8,8))

axs[0].scatter(X[:,0],X[:,1],c=X[:,2],cmap=plt.cm.seismic,s=35, clip_on=False)
artist.adjust_spines(axs[0])
axs[0].set_xlabel(artist.format("PC 1"))
axs[0].set_ylabel(artist.format("PC 2"))
Example #37
import json, itertools
import utils as tech
from awesome_print import ap 
#Are there any standardized drug names not in the taxonomy?
READ = 'rb'

taxonomy = json.load(open('drug-taxonomy.json',READ))
standardized_names = json.load(open('drug_misnaming.json',READ))

if len(set(tech.flatten(standardized_names.values())) - set(taxonomy.keys())) == 0:
	ap("All standardized drug names are in the taxonomy.")
Example #38

db = json.load(open('../data/db.json', 'rb'))
effects = open('../data/master-effect-list', 'rb').read().splitlines()
taxonomy = json.load(open('../data/drug-taxonomy.json', 'rb'))


def process(effect, entry):
    effects = list(
        set([x for drug in entry["drugs"] for x in taxonomy[drug]["effects"]]))

    #Calling x so as not to shadow the argument EFFECT passed to function
    return 1 if effect in effects else -1


ap('Calculating occurrence matrix')
m = np.array([[process(effect, entry) for entry in db.values()]
              for effect in effects],
             dtype=int)

np.savetxt('../data/effect-matrix.tsv', m, fmt='%d', delimiter='\t')

tally = {effect: sum(m[i, :] == 1) for i, effect in enumerate(effects)}

json.dump(tally, open('../data/frequency-of-effects.json', 'wb'))
json.dump(
    {
        'expected occurrence': len(db) / float(len(effects)),
        'iqr': iqr(tally.values())
    }, open('../data/thresholds.json', 'wb'))
Example #39
stopwords = set(open('stopwords.txt').read().splitlines())
punkt = set(string.punctuation)

upper_half = json.load(open('upper_half.json','rb'))
lower_half = json.load(open('lower_half.json','rb'))

comments = list(csv.DictReader(open('comments.csv','rb')))

comments = [comment for comment in comments 
		if comment['Student Comment'] != 'None' and comment['Student Comment'] !='NA']

upper_half_comments = [comment for comment in comments if comment['Name'] in upper_half.keys()]
lower_half_comments = [comment for comment in comments if comment['Name'] in lower_half.keys()]

ap('Upper len :%d'%len(upper_half_comments))
ap('Lower len :%d'%len(lower_half_comments))

upper_half_vocabulary  =' '.join(tech.cleanse(' '.join([comment['Student Comment'] for comment in upper_half_comments])))
lower_half_vocabulary = ' '.join(tech.cleanse(' '.join([comment['Student Comment'] for comment in lower_half_comments])))

upper_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(upper_half_vocabulary)) 
					if word not in punkt and word not in stopwords]
lower_half_words = [word.lower() for word in nltk.word_tokenize(to_ascii(lower_half_vocabulary)) 
					if word not in punkt and word not in stopwords]

upper_freqs = nltk.FreqDist(upper_half_words)
lower_freqs = nltk.FreqDist(lower_half_words)

print tech.weighted_jaccard_similarity(upper_freqs,lower_freqs)
Example #40
import json

from awesome_print import ap
import numpy as np


def iqr(data):
    return 0.5 * (np.percentile(data, 75) - np.percentile(data, 25))


tally = json.load(open('../data/drugs-that-actually-occurr.json', 'rb'))

ap(tally)
ap(np.median(tally.values()))
ap(9287 / float(len(tally)))
ap(iqr(tally.values()))
import json

import numpy as np 

from awesome_print import ap 
from sys import argv

nclus = argv[1]
labels = json.load(open('../data/effect-correlation-matrix-kmeans-cluster-labels.json','rb'))[str(nclus)]["labels"]
#Only care about labels for two cluster case

effects = open('../data/master-class-list','rb').read().splitlines()
print len(effects)
print len(labels)
d = {cluster:[effect for i,effect in enumerate(effects) if labels[i]==int(cluster)] 
		for cluster in np.unique(labels)}
ap(d)
Example #42
with open('comments.csv',READ) as csvfile:
	comments = [row for row in csv.DictReader(csvfile)]	

students =  {entry['Name']:"" for entry in comments}

def process(text):
	
	return ' '.join([nltk.stem.WordNetLemmatizer().lemmatize(word) for word in word_tokenize(text.decode('utf-8').encode('ascii','ignore').lower()) 
			if word not in stopwords and word not in ['x','na']])


for student in students:
	students[student] = process(' '.join([entry['Student Comment'] for entry in comments
								if entry['Name']==student]))

ap(students)
keys = [student for student in students.iterkeys() if len(students[student]) > 0]
tfx = TfidfVectorizer(tokenizer=word_tokenize, strip_accents='unicode',
	min_df=3, use_idf=True)

tfidf = tfx.fit_transform([students[student] for student in keys])

#--LSA
pca = PCA(n_components=3)
model = pca.fit_transform(tfidf.toarray())
ap(pca.explained_variance_ratio_)

#--lda
model = lda.LDA(n_topics=3, n_iter=1000,random_state=1)
model.fit(tfidf)
Example #43
def obfuscate_author(author_input_directory, output_file_path):

	global tokenizer
	global verb_tags
	global adjective_tags
	global noun_tags
	global adverb_tags
	global english_stopwords
	global stemmer	
	global is_american
	global american_to_british
	global british_to_american
	global british_american_dic
	global american_british_dic
	global vocabulary

	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP"]
	adjective_tags = ["JJ", "JJR", "JJS"]
	noun_tags = ["NN", "NNS"]
	adverb_tags = ["RB", "RBR", "RBS"]
	english_stopwords = stopwords.words('english')
	stemmer = PorterStemmer()

	obfuscation_document_content =  codecs.open(  os.path.join(author_input_directory, "original.txt") , "r", "utf-8").read().strip()
	lines = tokenizer.tokenize(obfuscation_document_content)

	position_tags = [(0,len(lines[0]))]
	for i in range(1,len(lines)):
		position_tags.append( ((position_tags[i-1][1] + 1) , (position_tags[i-1][1] + len(lines[i]) ) ) )


	lines = [line.replace("\r\n", " ").replace("\n", " ") for line in lines]
	obfuscated_lines = []

	vocabulary , average_document_length = (avg_length_and_vocab(author_input_directory))
	is_american = assign_american_or_brit(vocabulary, 'change.csv')
	british_to_american , american_to_british = read_dictionaries( 'change.csv')

	for line in lines:
		ap("About to obfuscate:")
		print(line)
		obfuscated_line = obfuscate(line)
		obfuscated_lines.append(obfuscated_line)
		ap("Obfuscated line is:")
		print(obfuscated_line)
		ap("--------------------------")

	output_dictionary = {}

	obfuscations = []

	for i in range(len(lines)):
		obfuscation = {}
		obfuscation["original"] = lines[i]
		obfuscation["original-start-charpos"] = position_tags[i][0]
		obfuscation["original-end-charpos"] = position_tags[i][1]
		obfuscation["obfuscation"] = obfuscated_lines[i]
		obfuscation["obfuscation-id"] = i+1
		obfuscations.append(obfuscation)	

	obfuscation_file = codecs.open(  output_file_path ,"w", "utf-8")
	obfuscation_file.write(json.dumps(obfuscations) )
	obfuscation_file.close()
def process(text):
	return ' '.join([nltk.stem.WordNetLemmatizer().lemmatize(word) for word in word_tokenize(text.decode('utf-8').encode('ascii','ignore').lower()) 
			if word not in stopwords and word not in ['x','na'] and word not in set(string.punctuation)])


for student in students:
	students[student] = process(' '.join([entry['Student Comment'] for entry in comments
								if entry['Name']==student]))

#ap(students)
keys = [student for student in students.iterkeys() if len(students[student]) > 0]
tfx = TfidfVectorizer(tokenizer=word_tokenize, strip_accents='unicode',
	min_df=3, use_idf=True)

tfidf = tfx.fit_transform([students[student] for student in keys])
lda = LatentDirichletAllocation(n_topics=8,max_iter=5,learning_method='online')
model = lda.fit_transform(tfidf)
'''
ap(dir(tfx))
ap(dict(zip(tfx.get_feature_names(),tfx.idf_)))
'''
#--lda
feature_names = tfx.get_feature_names()
feature_weights = tfx.idf_

n_top_words = 20
features = zip(feature_weights,feature_names)
with open('sklearn-lda-topics-w-weights','wb') as f:
	for topic_idx, topic in enumerate(lda.components_):
		print>>f, " : ".join(["%.4f * %s"%(features[i][0],features[i][1])
			for i in topic.argsort()[:-n_top_words - 1:-1]])
Example #45
import json 

import numpy as np 

from awesome_print import ap 

#threshold = .030888030888
#threshold /= (161*80) #Bonferroni correction
threshold = 0.206349206349
ap(threshold)
READ = 'rb'
m = np.loadtxt('../data/drug-jaccard-similarity-actually-occurred.tsv',delimiter='\t')
drugs = open('../data/drugs-that-actually-occurr',READ).read().splitlines()
taxonomy = json.load(open('../data/drug-taxonomy.json',READ))


d = {'%s-%s'%(drugs[i],drugs[j]):m[i,j] 
		for i,j in zip(*np.where(np.tril(m,k=-1)>threshold))}

ap(d)
ap(len(d))
json.dump(d,open('../data/significantly-similar-pairs','wb'))
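As a side note on the pair extraction above: np.tril(m, k=-1) keeps only entries strictly below the diagonal, so each unordered drug pair appears once and the diagonal self-similarities are ignored. A minimal check on a toy matrix (not project data):

import numpy as np
toy = np.array([[1.0, 0.9], [0.9, 1.0]])
print zip(*np.where(np.tril(toy, k=-1) > 0.5))  # [(1, 0)] -- only the lower-triangle copy of the pair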
#Which mixing probabilities occur more than by chance?

#Prevalence of different classes
Example #46
	def sqeeze(self):
		all_merges_made = set()
		merges_made = 100
		while merges_made > 0:
			merges_made = 0
			for node_x in self.all_nodes:
				word_keys = map( lambda key: ParseForest.id_to_word_dictionary[int(key)] , node_x.nexts.keys() )
				for word_key, word_key_count in Counter(word_keys).items():
					if word_key_count > 1:
						keys = filter( lambda key: ParseForest.id_to_word_dictionary[int(key)] == word_key , node_x.nexts.keys())
						nodes = map( lambda key: node_x.nexts[key], keys)
						for node_a, node_b in itertools.combinations( nodes, 2):
							if node_a.id != node_b.id:
								intersection = [x for x in [node_a.id, node_b.id] if x in [self.start.id, self.end.id]]
								if not all_merges_made.__contains__( "-".join([str(node_a.id), str(node_b.id)])):
									if not self.is_remotely_connected(node_a, node_b):
										if node_b in self.all_nodes:
											self.merge_fsm_nodes(node_a, node_b)
											all_merges_made.add("-".join([str(node_a.id), str(node_b.id)]))
											all_merges_made.add("-".join([str(node_b.id), str(node_a.id)]))
											merges_made += 1
									elif self.is_connected(node_a, node_b):
										if node_b in node_a.nexts.values():
											word_id = ParseForest.next_unique_word_id("*e*")
											node_a.nexts[word_id] = node_b
											node_b.previouses[word_id] = node_a
											redundant_edge = filter( lambda key: node_x.nexts.get(key, None) == node_b, keys)
											if (len(redundant_edge) > 0):
												redundant_edge = redundant_edge[0]
												target_node = node_x.nexts.get(redundant_edge, None)

												if target_node:
													node_x.nexts.__delitem__(redundant_edge)
													target_node.previouses.__delitem__(redundant_edge)
										else:
											word_id = ParseForest.next_unique_word_id("*e*")
											node_b.nexts[word_id] = node_a
											node_a.previouses[word_id] = node_b
											redundant_edge = filter( lambda key: node_x.nexts.get(key, None) == node_a, keys)
											if (len(redundant_edge) > 0):
												redundant_edge = redundant_edge[0]
												target_node = node_x.nexts.get(redundant_edge, None)
												if target_node:
													node_x.nexts.__delitem__(redundant_edge)
													target_node.previouses.__delitem__(redundant_edge)
										all_merges_made.add("-".join([str(node_a.id), str(node_b.id)]))

		merges_made = 100
		while merges_made > 0:
			merges_made = 0
			for node_x in self.all_nodes:
				word_keys = map( lambda key: ParseForest.id_to_word_dictionary[int(key)] , node_x.previouses.keys() )
				for word_key, word_key_count in Counter(word_keys).items():
					if word_key_count > 1:
						keys = filter( lambda key: ParseForest.id_to_word_dictionary[int(key)] == word_key , node_x.previouses.keys())
						nodes = map( lambda key: node_x.previouses[key], keys)
						for node_a, node_b in itertools.combinations( nodes, 2):
							if node_a.id != node_b.id:
								intersection = [x for x in [node_a.id, node_b.id] if x in [self.start.id, self.end.id]]
								if not all_merges_made.__contains__( "-".join([str(node_a.id), str(node_b.id)])):
									if not self.is_remotely_connected(node_a, node_b):
										if node_b in self.all_nodes:
											self.merge_fsm_nodes(node_a, node_b)
											all_merges_made.add("-".join([str(node_a.id), str(node_b.id)]))
											all_merges_made.add("-".join([str(node_b.id), str(node_a.id)]))
											merges_made += 1
									elif self.is_connected(node_a, node_b):
										if node_b in node_a.nexts.values():
											word_id = ParseForest.next_unique_word_id("*e*")
											node_a.nexts[word_id] = node_b
											node_b.previouses[word_id] = node_a
											redundant_edge = filter( lambda key: node_x.previouses.get(key, None) == node_a, keys)
											if (len(redundant_edge) > 0):
												redundant_edge = redundant_edge[0]	
												target_node = node_x.previouses.get(redundant_edge, None)
												if target_node:
													node_x.previouses.__delitem__(redundant_edge)
													target_node.nexts.__delitem__(redundant_edge)
										else:
											word_id = ParseForest.next_unique_word_id("*e*")
											node_b.nexts[word_id] = node_a
											node_a.previouses[word_id] = node_b
											redundant_edge = filter( lambda key: node_x.previouses.get(key, None) == node_b, keys)
											if (len(redundant_edge) > 0):
												redundant_edge = redundant_edge[0]
												target_node = node_x.previouses.get(redundant_edge, None)
												if target_node:
													node_x.previouses.__delitem__(redundant_edge)
													target_node.nexts.__delitem__(redundant_edge)

										all_merges_made.add("-".join([str(node_a.id), str(node_b.id)]))
		ap(all_merges_made)
Example #47
	for rating_one,rating_two in zip(patho_one_ratings,patho_two_ratings):
		#print rating_one,rating_two
		if isinstance(rating_one, list):
			rating_one = rating_one[0]
		if isinstance(rating_two, list):
			rating_two = rating_two[0]
		#print i 
		contingency_table[j,rating_one,rating_two] += 1

	
	kappas['%s-%s'%(pathologist_one,pathologist_two)] = cohens_kappa(contingency_table[j,:,:].squeeze()).kappa

print np.median(contingency_table,axis=0)
print 0.5*(np.percentile(contingency_table,75,axis=0) - np.percentile(contingency_table,25,axis=0))
json.dump(kappas,open('kappa-by-grade-no-ihc.json','wb'))
ap(np.median(kappas.values()))
print 0.5*(np.percentile(kappas.values(),75)-np.percentile(kappas.values(),25))

'''	
	df_one = pd.read_excel('stains.xls',pathologist,parse_cols=cols_with_grades, convert_float=False)
	df_two = pd.read_excel('no-stain.xls',pathologist,parse_cols=cols_with_grades, convert_float=False)

	patho_one_ratings = np.array([i[0][0] if len(i[0]) > 0 else -1 for i in df_one.apply(np.nonzero,axis=1).values]).astype(int)
	patho_two_ratings = np.array([i[0][0] if len(i[0]) > 0 else -1 for i in df_two.apply(np.nonzero,axis=1).values]).astype(int)

	#Really inefficient implementation, but too many exceptions to vectorize:

	contingency_table = np.zeros((3,3))

	for rating_one in patho_one_ratings:
		if type(rating_one) == type(list):
Example #48
import json

import numpy as np

from awesome_print import ap

#threshold = .030888030888
#threshold /= (161*80) #Bonferroni correction
threshold = 0.206349206349
ap(threshold)
READ = 'rb'
m = np.loadtxt('../data/drug-jaccard-similarity-actually-occurred.tsv',
               delimiter='\t')
drugs = open('../data/drugs-that-actually-occurr', READ).read().splitlines()
taxonomy = json.load(open('../data/drug-taxonomy.json', READ))

d = {
    '%s-%s' % (drugs[i], drugs[j]): m[i, j]
    for i, j in zip(*np.where(np.tril(m, k=-1) > threshold))
}

ap(d)
ap(len(d))
json.dump(d, open('../data/significantly-similar-pairs', 'wb'))
#Which mixing probabilities occur more than by chance?

#Prevalence of different classes
Example #49
#! /usr/bin/env python

import requests
from awesome_print import ap

if __name__ == '__main__':
    resp = requests.get('https://api.github.com/users/macro1/repos')
    ap(resp.json())
Example #50
labels,freqs = zip(*sorted(effect_tally.items(),key=lambda item:item[1],reverse=True)[:cutoff])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(freqs,'k--',linewidth=2)
artist.adjust_spines(ax)
ax.set_xticks(xrange(len(labels)))
ax.set_xticklabels(map(artist.format,labels),rotation='vertical')
ax.set_ylabel(artist.format('No. of mentions'))

plt.tight_layout()
plt.savefig('../imgs/effect-frequency.png')

del fig,ax 
'''
'''
#How often does each class occur in the database?
class_tally = dict(Counter(list(tech.flatten([taxonomy[drug]["class"] 
				for drug in tally.keys()]))))

ap(class_tally)
'''

for drug in taxonomy:
	if 'class' not in taxonomy[drug]:
		ap(drug) 
'''
labels,freqs = zip(*sorted(class_tally.items(),key=lambda item:item[1],reverse=True)[:cutoff])
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(freqs,'k--',linewidth=2)