def groundtruth(curated, gt_type, verbose=True):
    """Load a groundtruth CausalArray from disk, building the file if absent.

    Args:
        curated (bool): selects the '_curated' variant of the file name
            when truthy, the unsuffixed variant otherwise.
        gt_type (str): which groundtruth to load; must be a member of the
            module-level ``groundtruth_types``.
        verbose (bool): forwarded to ``_make_groundtruth`` when the file
            has to be generated.

    Returns:
        CausalArray: the array loaded from the groundtruth file.

    Raises:
        AssertionError: if ``gt_type`` is not in ``groundtruth_types``.
    """
    # Reject unknown types before touching the filesystem, so a bad
    # argument can never trigger the (slow) regeneration below.
    assert gt_type in groundtruth_types
    filename = filename_groundtruth.format(
        '_curated' if curated else '', gt_type)
    if not os.path.exists(filename):
        # Pass the caller's verbosity through; previously it was dropped
        # and the builder always ran with its own default.
        _make_groundtruth(curated=curated, verbose=verbose)
    return CausalArray.load(file=filename)
def gt(self, name, successful_ints=False, change_strint_name=False):
    """Return one of the stored groundtruth measures as a CausalArray.

    Args:
        name (str): type of groundtruth; several spellings are accepted
            per measure (e.g. 'gt_abs', 'gt.abs' or 'abs').
        successful_ints (bool): when truthy, only the rows belonging to
            ``self.successful_ints`` are kept (row positions are looked up
            in ``self.map_mutants``) and those become the causes.
        change_strint_name (bool): when rows were restricted and this is
            truthy, '_strint' is appended to the resulting array name.

    Returns:
        CausalArray: groundtruth

    Raises:
        Exception: if ``name`` is not a known groundtruth, or if
            ``successful_ints`` is requested but none are available.
    """
    # (accepted spellings, attribute holding the data, units label)
    known_measures = (
        (('gt_abs', 'gt.abs', 'abs'), 'gt_abs', 'j'),
        (('gt_rel', 'gt.rel', 'rel'), 'gt_rel', 'j/i'),
        (('gt_abs_norm', 'gt.abs.norm', 'abs_norm', 'abs.norm'),
         'gt_abs_norm', '1'),
        (('gt_abs_norm_robust', 'gt.abs.norm.robust', 'abs_norm_robust',
          'abs.norm.robust'), 'gt_abs_norm_robust', '1'),
        (('gt_rel_norm', 'gt.rel.norm', 'rel_norm', 'rel.norm'),
         'gt_rel_norm', '1'),
        (('gt_nature', 'gt.nature', 'nature'), 'gt_nature', '1'),
        (('gt_sie', 'gt.sie', 'sie'), 'gt_sie', '0'),
    )

    row_labels = self.mutants
    column_labels = self.genes

    for spellings, attribute, units in known_measures:
        if name in spellings:
            # Only the selected measure is read from the instance.
            values = getattr(self, attribute)
            break
    else:
        raise Exception('Unknown groundtruth %s!' % name)

    if successful_ints:
        row_labels = self.successful_ints
        if len(row_labels) == 0:
            raise Exception('No successful interventions found!')
        selected_rows = [self.map_mutants[i] for i in row_labels]
        values = values[selected_rows, ]
        if change_strint_name:
            name += '_strint'

    return CausalArray(array=values, causes=row_labels,
                       effects=column_labels, units=units, name=name)
def _make_groundtruth(curated, round_size=None, verbose=True):
    """Query SGD (through the YeastMine service) and write groundtruth files.

    Two boolean gene-by-gene adjacency matrices are filled, one for rows
    typed 'genetic interactions' and one for 'physical interactions'.
    Three CausalArrays are then saved via ``filename_groundtruth``:
    'genetic', 'physical' and 'all' (element-wise OR of the two).

    Args:
        curated (bool): when truthy, only interactions annotated as
            'manually curated' are requested and the '_curated' / '_cur'
            suffixes are used for file and array names.
        round_size: unused; kept so existing callers keep working.
        verbose (bool): print progress and summary information.
    """
    start_time = time.time()
    file_suffix = '_curated' if curated else ''
    name_suffix = '_cur' if curated else ''

    # NOTE: print(...) with one pre-formatted argument behaves the same
    # under the Python 2 print statement and the Python 3 function.
    if verbose:
        # The conditional is parenthesised by .format(); without that, '%'
        # bound tighter than 'if/else' and the message came out truncated.
        print('Querying yeastmine for {} interactions.'.format(
            'manually curated' if curated else 'all'))

    # Start intermine service
    yeastmine = Service('http://yeastmine.yeastgenome.org/yeastmine/service')

    # 1. Get a list of all genes in a quick query. Identifiers come back
    #    as unicode, so they are normalised with str().
    query = yeastmine.new_query('Gene')
    query.add_view('secondaryIdentifier')
    query.add_sort_order('Gene.secondaryIdentifier', 'ASC')
    query.add_constraint(
        'Gene', 'IN', 'ALL_Verified_Uncharacterized_Dubious_ORFs', code='A')

    genes_list = [str(row['secondaryIdentifier']) for row in query.rows()]
    genes_map = {gene: index for index, gene in enumerate(genes_list)}

    if verbose:
        # Shortest identifier length, capped at 100 as before.
        min_length = min([len(gene) for gene in genes_list] + [100])
        print('Gene char length min: {}'.format(min_length))
        # should be 6604 for ALL_Verified_Uncharacterized_Dubious_ORFs
        print('Number of genes: {}'.format(len(genes_list)))

    # 2. Query for interactions.
    query = yeastmine.new_query('Gene')
    query.add_view('secondaryIdentifier',
                   'interactions.details.type',
                   'interactions.participant2.secondaryIdentifier',
                   'interactions.details.role1')
    query.add_sort_order('Gene.secondaryIdentifier', 'ASC')
    query.add_constraint('organism.shortName', '=', 'S. cerevisiae', code='B')
    query.add_constraint(
        'Gene', 'IN', 'ALL_Verified_Uncharacterized_Dubious_ORFs', code='A')
    if curated:
        query.add_constraint('interactions.details.annotationType', '=',
                             'manually curated', code='C')

    n_genes = len(genes_list)
    adjacency_matrix_genetic = np.zeros((n_genes, n_genes), dtype='bool')
    adjacency_matrix_physical = np.zeros((n_genes, n_genes), dtype='bool')
    matrix_for_type = {
        'physical interactions': adjacency_matrix_physical,
        'genetic interactions': adjacency_matrix_genetic,
    }

    # 3. Add each row to the matching adjacency matrix.
    current_gene = None
    for row in query.rows():
        gene = str(row['secondaryIdentifier'])
        if gene != current_gene:
            current_gene = gene
            if verbose:
                print('Querying {} - {}'.format(
                    genes_map[current_gene], current_gene))

        participant = str(row['interactions.participant2.secondaryIdentifier'])
        # Dict lookup instead of scanning genes_list for every row.
        if participant not in genes_map:
            continue
        # Only rows where the queried gene has role 'Bait' are recorded
        # (Bait <--- Hit relationship).
        if row['interactions.details.role1'] != 'Bait':
            continue
        matrix = matrix_for_type.get(row['interactions.details.type'])
        if matrix is None:
            continue
        # Row index is the participant, column index is the queried gene.
        matrix[genes_map[participant], genes_map[gene]] = True

    if verbose:
        print('Nuber of genetic interactions {}'.format(
            adjacency_matrix_genetic.sum()))
        print('Nuber of physical interactions {}'.format(
            adjacency_matrix_physical.sum()))

    # 4. Save all data.
    CausalArray(
        array=adjacency_matrix_genetic,
        causes=genes_list,
        name='sgd_gen{}'.format(name_suffix),
    ).save(file=filename_groundtruth.format(file_suffix, 'genetic'))

    CausalArray(
        array=adjacency_matrix_physical,
        causes=genes_list,
        name='sgd_phys{}'.format(name_suffix),
    ).save(file=filename_groundtruth.format(file_suffix, 'physical'))

    # For boolean arrays the former np.add was already an element-wise OR;
    # logical_or states that intent directly.
    CausalArray(
        array=np.logical_or(adjacency_matrix_genetic,
                            adjacency_matrix_physical),
        causes=genes_list,
        name='sgd_all{}'.format(name_suffix),
    ).save(file=filename_groundtruth.format(file_suffix, 'all'))

    if verbose:
        print('Done in {:.2f}'.format(time.time() - start_time))
def gt_inclusive_genetic():
    """Load the non-curated 'genetic' groundtruth CausalArray from disk.

    The file is read directly; unlike ``groundtruth(curated=False,
    gt_type='genetic')`` nothing is generated when the file is absent.
    """
    path = filename_groundtruth.format('', 'genetic')
    return CausalArray.load(file=path)
def gt_inclusive_physical():
    """Load the non-curated 'physical' groundtruth CausalArray from disk.

    The file is read directly; unlike ``groundtruth(curated=False,
    gt_type='physical')`` nothing is generated when the file is absent.
    """
    path = filename_groundtruth.format('', 'physical')
    return CausalArray.load(file=path)
def gt_curated_genetic():
    """Load the curated 'genetic' groundtruth CausalArray from disk.

    The file is read directly; unlike ``groundtruth(curated=True,
    gt_type='genetic')`` nothing is generated when the file is absent.
    """
    path = filename_groundtruth.format('_curated', 'genetic')
    return CausalArray.load(file=path)
def gt_curated_physical():
    """Load the curated 'physical' groundtruth CausalArray from disk.

    The file is read directly; unlike ``groundtruth(curated=True,
    gt_type='physical')`` nothing is generated when the file is absent.
    """
    path = filename_groundtruth.format('_curated', 'physical')
    return CausalArray.load(file=path)