def load_genes(geneinter_file='', ignore_file='', squaring=True):
    '''
    Loads all of the gene pairs and their corresponding interaction scores
    into memory. It also keeps a set of all genes for iterative purposes.

    The input is read as tab-delimited rows whose first three columns are
    (gene 1, gene 2, interaction score). The results are stored in the
    module-level collections 'genespace', 'genes' and 'gis'.

    There is some criteria for excluding genes from this process:

    1) If an ignore gene list is provided, any pair with a gene in that list
       is skipped. The genes of a skipped pair are still added to
       'genespace', but not to 'genes' or 'gis'.
    2) If an interaction score is zero, it is *KEPT* in the set of genes used
       to generate BPMs with an interaction score of 0. A score that cannot
       be parsed as a float is treated as 0.

    If we add the gene pair (g1, g2) with score S to the dictionary, then
    we'll also add (g2, g1) with score S to the dictionary. This increases
    memory usage but saves cpu cycles when looking up interaction scores.
    Basically, we force the dictionary to be a symmetric matrix.

    :param geneinter_file: Path of the interaction file. Only used when the
                           module-level 'conf' is None; otherwise
                           'conf.geneinter' is read.
    :param ignore_file: Path of the ignore gene list (one gene per line).
                        Only used when 'conf' is None; otherwise
                        'conf.ignore' is read. A false value means that no
                        gene is ignored.
    :param squaring: When true, every score is replaced by its square with
                     the original sign preserved.
    '''
    # Decide where the input comes from *before* touching any attribute of
    # 'conf', so that the 'conf is None' fallback to the arguments is
    # actually reachable.
    if conf is not None:
        ignore_path, inter_path = conf.ignore, conf.geneinter
    else:
        ignore_path, inter_path = ignore_file, geneinter_file

    ignore = set()
    if ignore_path:
        for line in gzipOpen(ignore_path):
            ignore.add(line.strip())

    # NOTE(review): since 'squaring' defaults to True, a false
    # 'conf.squaring' only disables squaring when the caller also passes
    # squaring=False. Kept as is; confirm that this is intended.
    do_square = (conf is not None and conf.squaring) or squaring

    reader = csv.reader(gzipOpen(inter_path), delimiter='\t')
    for row in reader:
        g1, g2, intscore = row[0], row[1], row[2]

        # 'genespace' records every gene seen, including ignored ones.
        genespace.add(g1)
        genespace.add(g2)

        # Ignore pairs where one or both genes are in the ignore gene list
        if g1 in ignore or g2 in ignore:
            continue

        # If there is no interaction score, force it to be 0
        try:
            ginter = float(intscore)
        except ValueError:
            ginter = 0.0

        # Square the magnitude but keep the sign of the interaction.
        if do_square:
            if ginter < 0:
                ginter = -(ginter ** 2)
            else:
                ginter = ginter ** 2

        gis[(g1, g2)] = ginter
        gis[(g2, g1)] = ginter
        genes.add(g1)
        genes.add(g2)

    parallel.inc_counter(parallel.costs['load_genes'])
def group_genes(indexed_gene):
    '''
    group_genes is applied to every gene, and a BPM is generated from
    *every* gene. In particular, given M happy bipartitions, generate a BPM
    where the first module contains all genes that appeared in the same set
    as the given gene in at least a fraction 'conf.C' of the bipartitions,
    and the second module contains all genes that appeared in the opposite
    set in at least a fraction 'conf.C' of the bipartitions.

    The frequency is divided by 'conf.M' (presumably the number of
    bipartitions in the module-level 'happyparts'; verify against the
    caller).

    :param indexed_gene: A two-element tuple (i, g1). Only the gene 'g1' is
                         used; the index 'i' is ignored. It is a single
                         positional argument because tuple parameters are
                         not valid syntax in Python 3.
    :return: A tuple (mod1, mod2) of two sets of genes.
    '''
    _, g1 = indexed_gene

    # Which side(s) of each bipartition hold g1 does not depend on g2, so
    # work it out once instead of once per candidate gene.
    sides = [(A, B, g1 in A, g1 in B) for A, B in happyparts]

    mod1, mod2 = set(), set()
    for g2 in geneinter.genes:
        # Count the number of times g2 is in the same set as g1
        freqsame = sum(1 for A, B, in_a, in_b in sides
                       if (in_a and g2 in A) or (in_b and g2 in B))

        ratio = float(freqsame) / conf.M
        if ratio >= conf.C:
            mod1.add(g2)
        elif (1 - ratio) >= conf.C:
            mod2.add(g2)

    parallel.inc_counter()
    parallel.print_progress()

    return mod1, mod2
def load_genes(geneinter_file='', ignore_file='', squaring=True):
    '''
    Loads all of the gene pairs and their corresponding interaction scores
    into memory. It also keeps a set of all genes for iterative purposes.

    The input is read as tab-delimited rows whose first three columns are
    (gene 1, gene 2, interaction score). The results are stored in the
    module-level collections 'genespace', 'genes' and 'gis'.

    There is some criteria for excluding genes from this process:

    1) If an ignore gene list is provided, any pair with a gene in that list
       is skipped. The genes of a skipped pair are still added to
       'genespace', but not to 'genes' or 'gis'.
    2) If an interaction score is zero, it is *KEPT* in the set of genes used
       to generate BPMs with an interaction score of 0. A score that cannot
       be parsed as a float is treated as 0.

    If we add the gene pair (g1, g2) with score S to the dictionary, then
    we'll also add (g2, g1) with score S to the dictionary. This increases
    memory usage but saves cpu cycles when looking up interaction scores.
    Basically, we force the dictionary to be a symmetric matrix.

    :param geneinter_file: Path of the interaction file. Only used when the
                           module-level 'conf' is None; otherwise
                           'conf.geneinter' is read.
    :param ignore_file: Path of the ignore gene list (one gene per line).
                        Only used when 'conf' is None; otherwise
                        'conf.ignore' is read. A false value means that no
                        gene is ignored.
    :param squaring: When true, every score is replaced by its square with
                     the original sign preserved.
    '''
    # Decide where the input comes from *before* touching any attribute of
    # 'conf', so that the 'conf is None' fallback to the arguments is
    # actually reachable.
    if conf is not None:
        ignore_path, inter_path = conf.ignore, conf.geneinter
    else:
        ignore_path, inter_path = ignore_file, geneinter_file

    ignore = set()
    if ignore_path:
        for line in gzipOpen(ignore_path):
            ignore.add(line.strip())

    # NOTE(review): since 'squaring' defaults to True, a false
    # 'conf.squaring' only disables squaring when the caller also passes
    # squaring=False. Kept as is; confirm that this is intended.
    do_square = (conf is not None and conf.squaring) or squaring

    reader = csv.reader(gzipOpen(inter_path), delimiter='\t')
    for row in reader:
        g1, g2, intscore = row[0], row[1], row[2]

        # 'genespace' records every gene seen, including ignored ones.
        genespace.add(g1)
        genespace.add(g2)

        # Ignore pairs where one or both genes are in the ignore gene list
        if g1 in ignore or g2 in ignore:
            continue

        # If there is no interaction score, force it to be 0
        try:
            ginter = float(intscore)
        except ValueError:
            ginter = 0.0

        # Square the magnitude but keep the sign of the interaction.
        if do_square:
            if ginter < 0:
                ginter = -(ginter ** 2)
            else:
                ginter = ginter ** 2

        gis[(g1, g2)] = ginter
        gis[(g2, g1)] = ginter
        genes.add(g1)
        genes.add(g2)

    parallel.inc_counter(parallel.costs['load_genes'])
def group_genes(indexed_gene):
    '''
    group_genes is applied to every gene, and a BPM is generated from
    *every* gene. In particular, given M happy bipartitions, generate a BPM
    where the first module contains all genes that appeared in the same set
    as the given gene in at least a fraction 'conf.C' of the bipartitions,
    and the second module contains all genes that appeared in the opposite
    set in at least a fraction 'conf.C' of the bipartitions.

    The frequency is divided by 'conf.M' (presumably the number of
    bipartitions in the module-level 'happyparts'; verify against the
    caller).

    :param indexed_gene: A two-element tuple (i, g1). Only the gene 'g1' is
                         used; the index 'i' is ignored. It is a single
                         positional argument because tuple parameters are
                         not valid syntax in Python 3.
    :return: A tuple (mod1, mod2) of two sets of genes.
    '''
    _, g1 = indexed_gene

    # Which side(s) of each bipartition hold g1 does not depend on g2, so
    # work it out once instead of once per candidate gene.
    sides = [(A, B, g1 in A, g1 in B) for A, B in happyparts]

    mod1, mod2 = set(), set()
    for g2 in geneinter.genes:
        # Count the number of times g2 is in the same set as g1
        freqsame = sum(1 for A, B, in_a, in_b in sides
                       if (in_a and g2 in A) or (in_b and g2 in B))

        ratio = float(freqsame) / conf.M
        if ratio >= conf.C:
            mod1.add(g2)
        elif (1 - ratio) >= conf.C:
            mod2.add(g2)

    parallel.inc_counter()
    parallel.print_progress()

    return mod1, mod2
def localmaxcut(m):
    '''
    Generates a random bipartition and makes the bipartition 'happy' by
    applying 'Weighted-Flip' (from Leiserson et al., 2011) until there are no
    unhappy genes left.

    :param m: Not used by the body (presumably an index supplied by the
              caller that maps this function over a range; verify against
              the caller).
    :return: The two sets (A, B) of the resulting bipartition.
    '''
    A, B = random_bipartition()

    def same_set(g1, g2):
        '''
        Reports whether both genes are currently in A or both are in B. It
        reads A and B at call time, so it reflects every flip made below.
        '''
        return (g1 in A and g2 in A) or (g1 in B and g2 in B)

    def weights(g1):
        '''
        Calculates the total neighboring weight of 'g1'. The total
        neighboring weight is a tuple of the sum of interactions in the same
        set as g1 and the sum of interactions in the opposite set as g1.

        The tuple in this case is represented by a dictionary with keys
        'same' and 'other'. I'm using a dictionary because the values need
        to be mutable; they change as we move vertices between the
        partitions.
        '''
        ws = {'same': 0, 'other': 0}
        for g2 in geneinter.genes:
            w = geneinter.gi(g1, g2)
            if same_set(g1, g2):
                ws['same'] += w
            else:
                ws['other'] += w
        return ws

    nweights = {g: weights(g) for g in geneinter.genes}

    unhappy = get_unhappy(nweights)
    while unhappy:
        # Flip one randomly chosen unhappy gene to the other set.
        v = random.choice(unhappy)

        if v in A:
            A.remove(v)
            B.add(v)
        else:
            A.add(v)
            B.remove(v)

        # This loop eliminates the need to recalculate 'weights' for every
        # gene again, which is O(n^2) in the number of genes. This loop is
        # O(n) but comes at the cost of clarity.
        #
        # The idea is to modify the weights of every other interacting gene
        # and to switch the 'same' and 'other' scores of the gene that was
        # made happy.
        #
        # 'items()' rather than the Python 2-only 'iteritems()'; only the
        # values are mutated, never the keys, so iteration stays valid.
        for g, nw in nweights.items():
            if g == v:
                nw['same'], nw['other'] = nw['other'], nw['same']
                continue

            # The interaction score between this gene and the gene that
            # was made happy.
            w = geneinter.gi(v, g)

            # If the two genes are now in the same set, then 'g' gets a
            # boost to its happiness. Otherwise, 'g' becomes more unhappy.
            if same_set(v, g):
                nw['same'] += w
                nw['other'] -= w
            else:
                nw['same'] -= w
                nw['other'] += w

        # Refresh the unhappy list
        unhappy = get_unhappy(nweights)

    parallel.inc_counter()
    parallel.print_progress()

    return A, B
def enrich(modulecnt, module):
    '''
    Initiates a request to Funcassociate and returns a dictionary of goterms.

    The second argument passed to 'faread.functionate' is 'modulecnt'
    clamped to the range [1000, 10000].

    :param modulecnt: The total number of modules in the BPM file.
    :param module: A tuple (bpmi, modi, genes) representing a module. 'bpmi'
                   is the BPM index number, 'modi' is the module index
                   number, and 'genes' is a list of gene names in the
                   module. It is a single positional argument because tuple
                   parameters are not valid syntax in Python 3.
    :return: A four-tuple of the input module and its associated go terms.
    '''
    bpmi, modi, genes = module

    goterms = faread.functionate(genes, min(10000, max(1000, modulecnt)))

    parallel.inc_counter()
    parallel.print_progress()

    return bpmi, modi, genes, goterms


def sortgo(goterms):
    '''
    Sorts the keys of a goterms dictionary according to the current
    configuration.
    '''
    if conf is None:
        reverse = False
        sort_by = 'p'
    else:
        reverse = conf.order_go == 'desc'
from bpm import conf, faread, parallel


def enrich(modulecnt, module):
    '''
    Initiates a request to Funcassociate and returns a dictionary of goterms.

    The second argument passed to 'faread.functionate' is 'modulecnt'
    clamped to the range [1000, 10000].

    :param modulecnt: The total number of modules in the BPM file.
    :param module: A tuple (bpmi, modi, genes) representing a module. 'bpmi'
                   is the BPM index number, 'modi' is the module index
                   number, and 'genes' is a list of gene names in the
                   module. It is a single positional argument because tuple
                   parameters are not valid syntax in Python 3.
    :return: A four-tuple of the input module and its associated go terms.
    '''
    bpmi, modi, genes = module

    goterms = faread.functionate(genes, min(10000, max(1000, modulecnt)))

    parallel.inc_counter()
    parallel.print_progress()

    return bpmi, modi, genes, goterms


def sortgo(goterms):
    '''
    Sorts the keys of a goterms dictionary according to the current
    configuration.
    '''
    if conf is None:
        reverse = False
        sort_by = 'p'
    else:
        reverse = conf.order_go == 'desc'
        sort_by = conf.sort_go_by
def localmaxcut(m):
    '''
    Generates a random bipartition and makes the bipartition 'happy' by
    applying 'Weighted-Flip' (from Leiserson et al., 2011) until there are no
    unhappy genes left.

    :param m: Not used by the body (presumably an index supplied by the
              caller that maps this function over a range; verify against
              the caller).
    :return: The two sets (A, B) of the resulting bipartition.
    '''
    A, B = random_bipartition()

    def same_set(g1, g2):
        '''
        Reports whether both genes are currently in A or both are in B. It
        reads A and B at call time, so it reflects every flip made below.
        '''
        return (g1 in A and g2 in A) or (g1 in B and g2 in B)

    def weights(g1):
        '''
        Calculates the total neighboring weight of 'g1'. The total
        neighboring weight is a tuple of the sum of interactions in the same
        set as g1 and the sum of interactions in the opposite set as g1.

        The tuple in this case is represented by a dictionary with keys
        'same' and 'other'. I'm using a dictionary because the values need
        to be mutable; they change as we move vertices between the
        partitions.
        '''
        ws = {'same': 0, 'other': 0}
        for g2 in geneinter.genes:
            w = geneinter.gi(g1, g2)
            if same_set(g1, g2):
                ws['same'] += w
            else:
                ws['other'] += w
        return ws

    nweights = {g: weights(g) for g in geneinter.genes}

    unhappy = get_unhappy(nweights)
    while unhappy:
        # Flip one randomly chosen unhappy gene to the other set.
        v = random.choice(unhappy)

        if v in A:
            A.remove(v)
            B.add(v)
        else:
            A.add(v)
            B.remove(v)

        # This loop eliminates the need to recalculate 'weights' for every
        # gene again, which is O(n^2) in the number of genes. This loop is
        # O(n) but comes at the cost of clarity.
        #
        # The idea is to modify the weights of every other interacting gene
        # and to switch the 'same' and 'other' scores of the gene that was
        # made happy.
        #
        # 'items()' rather than the Python 2-only 'iteritems()'; only the
        # values are mutated, never the keys, so iteration stays valid.
        for g, nw in nweights.items():
            if g == v:
                nw['same'], nw['other'] = nw['other'], nw['same']
                continue

            # The interaction score between this gene and the gene that
            # was made happy.
            w = geneinter.gi(v, g)

            # If the two genes are now in the same set, then 'g' gets a
            # boost to its happiness. Otherwise, 'g' becomes more unhappy.
            if same_set(v, g):
                nw['same'] += w
                nw['other'] -= w
            else:
                nw['same'] -= w
                nw['other'] += w

        # Refresh the unhappy list
        unhappy = get_unhappy(nweights)

    parallel.inc_counter()
    parallel.print_progress()

    return A, B