def findKthLargest(self, nums, k):
    """Return the k-th largest value in ``nums``.

    A min-heap capped at ``k`` entries keeps the ``k`` largest values seen
    so far; its root is the answer once every value has been offered.
    Runs in O(N log k) time with O(k) extra space.

    :type nums: List[int]
    :type k: int
    :rtype: int
    :returns: the k-th largest value, or 0 when ``nums`` is empty.
    """
    if not nums:
        return 0

    top_k = []
    for value in nums:
        if len(top_k) < k:
            # Still filling the window of k candidates.
            heapq.heappush(top_k, value)
        else:
            # Window is full: admit the new value and evict the smallest.
            heapq.heappushpop(top_k, value)
    return heapq.heappop(top_k)
def find_hits(self, tensor, opts):
    """Return the best ganglion hits found in ``tensor``, best score first.

    Only entries whose second field equals ``self.gang_window()[t1]`` (with
    ``t1`` the index along the first axis of ``tensor``) are considered.
    Each returned hit is a tuple ``(score, value, t1, t2, t3)`` where
    ``score`` and ``value`` are the entry's first two fields and
    ``t1, t2, t3`` are its indices in ``tensor``.

    ``opts`` selects how many hits are returned:

    * ``int``   -- the top ``opts`` hits.
    * ``float`` -- that fraction of the hits (``0.1`` -> top 10 %).
    * ``str``   -- a percentage such as ``'10'`` (top 10 %), or the keyword
      ``'tenth_percentile'`` (same as ``'10'``).

    Returns ``None`` when ``opts`` is of another type or is a string that
    is neither the keyword nor a number.
    """
    if not isinstance(opts, (int, float, str)):
        return None

    if isinstance(opts, str):
        if opts == 'tenth_percentile':
            # Previously this keyword fell into float() and returned None.
            opts = 0.1
        else:
            try:
                opts = float(opts) / 100
            except ValueError:
                return None

    gang_window = self.gang_window()
    hits = [
        (T3[0], T3[1], t1, t2, t3)
        for t1, T1 in enumerate(tensor)
        for t2, T2 in enumerate(T1)
        for t3, T3 in enumerate(T2)
        if T3[1] == gang_window[t1]
    ]
    # Highest score first; ties fall back to the remaining fields ascending,
    # which is the order the former heap of (-score, ...) tuples produced.
    hits.sort(key=lambda hit: (-hit[0],) + hit[1:])

    if isinstance(opts, float):
        return hits[0:int(math.ceil(len(hits) * opts))]
    return hits[0:opts]
def astar(start, h, d=None):
    """Run A* from ``start`` using heuristic ``h``.

    The frontier is a heap of ``Node`` objects, so the expansion order is
    whatever ordering ``Node`` defines (presumably by ``f`` -- confirm
    against the Node class).

    :param start: initial state, wrapped with ``Node(start)``.
    :param h: callable taking a node and returning its heuristic estimate.
    :param d: optional depth cut-off used when collecting table data. When
        ``d >= 2`` the search stops as soon as a node with ``g == d`` is
        popped: that node is returned if it is a goal, otherwise the search
        gives up. ``None`` (the default) or a value below 2 disables it.
    :returns: ``[goal_node, nodes_generated]`` on success, where the count
        is ``len(frontier) + len(explored) + 1``; ``None`` when the frontier
        is exhausted or the depth cut-off is hit on a non-goal node.
    """
    explored = set()
    frontier = []
    hq.heappush(frontier, Node(start))

    # ``None >= 2`` raises TypeError on Python 3, so test for None first.
    depth_limited = d is not None and d >= 2

    while frontier:
        current = hq.heappop(frontier)

        if isGoal(current):
            return [current, len(frontier) + len(explored) + 1]
        if depth_limited and current.g == d:
            # Reached the requested depth without finding a goal.
            return None

        explored.add(str(current))
        for succ in successors(current):
            if str(succ) not in explored:
                succ.g = current.g + 1
                succ.f = succ.g + h(succ)
                succ.parent = current
                hq.heappush(frontier, succ)
    return None
def astar(start):
    """Run A* from ``start`` with unit step costs.

    ``start`` (and every state produced by ``moves()``) must provide
    ``heuristic()``, ``isGoal()``, ``moves()`` yielding ``(child, move)``
    pairs, and a ``str()`` form that identifies the state.

    :returns: ``(cost, path)`` for the first goal popped from the frontier,
        where ``path`` is the list of moves leading to it, or ``None`` when
        the frontier is exhausted.
    """
    pq = []
    closed = set()
    # Entries are [f, g, tie, state, path]. ``tie`` is a strictly increasing
    # insertion counter so entries with equal (f, g) are ordered FIFO and the
    # heap never has to compare state objects or paths (a TypeError on
    # Python 3 for states that do not define ordering).
    tie = 0
    hq.heappush(pq, [start.heuristic(), 0, tie, start, []])

    while pq:
        _, g, _, cur, path = hq.heappop(pq)
        if cur.isGoal():
            return g, path
        closed.add(str(cur))
        for child, move in cur.moves():
            if str(child) not in closed:
                tie += 1
                # A fresh list per entry keeps sibling paths independent.
                hq.heappush(
                    pq,
                    [g + 1 + child.heuristic(), g + 1, tie, child, path + [move]])
    return None
def _get_next_forwarder(self): """ Get next available forwarder. It will accumulate forwarder load and return forwarder with minimal load. """ with self._forwarders_lock.reader_lock: try: forwarder_load = heapq.heappop(self._available_forwarderloads) forwarder = forwarder_load.forwarder forwarder_load.load += 1 heapq.heappush(self._available_forwarderloads, forwarder_load) return forwarder except IndexError: raise DispatchEngineException("No available forwarders")
def get(self, url, max_last_time):
    """Return a proxy to use for ``url``, or ``None`` if none is available.

    Proxies have the form ``http://8.8.8.8:8000``. Bookkeeping is kept per
    domain in ``self._table[domain]``: one ``[last_time, count]`` entry per
    proxy plus a heap under the key ``'priority'`` holding
    ``[last_time, count, proxy]`` items.

    * If the pool is empty, return ``None``.
    * If some pool proxy has not been used for this domain yet, register
      one of them and return it.
    * Otherwise pop the item with the lowest ``last_time``. If
      ``max_last_time`` is lower than that ``last_time``, put the item back
      and return ``None``; else record the proxy with the current time and
      an updated count, and return it.
    """
    host = urlparse.urlparse(url).netloc
    table = self._table[host]
    stamp = time.time()

    if not self._pool:
        return

    unused = self._pool.difference(set(table.keys()))
    if len(unused) != 0:
        # A pool proxy that this domain has never seen: start its counter.
        fresh_count = self._count_rule('get', 0)
        picked = unused.pop()
        table[picked] = [stamp, fresh_count]
        queue = table.setdefault('priority', [])
        heapq.heappush(queue, [stamp, fresh_count, picked])
        return picked

    try:
        oldest = heapq.heappop(table['priority'])
        used_at, uses, picked = oldest
        if max_last_time < used_at:
            # Even the least recently used proxy is too recent: restore it.
            heapq.heappush(table['priority'], oldest)
            return
        new_count = self._count_rule('get', uses)
        table[picked] = [stamp, new_count]
        heapq.heappush(table['priority'], [stamp, new_count, picked])
        return picked
    except IndexError:
        print('priority queue is empty.')
def main(): parser = OptionParser() #(usage=usage) parser.add_option( "--InpFile", dest="InpFile", help= "Input interaction file. Assumed that the first six fields contain two interacting chromosome information." ) parser.add_option( "--headerInp", dest="headerInp", type="int", help= "If 1, indicates that input interaction file has a header line (such as field names). Default 1." ) parser.add_option("--OutFile", dest="OutFile", help="Output merged interaction file") parser.add_option("--binsize", dest="binsize", type="int", help="Size of bins employed. DEFAULT 5 Kb.") parser.add_option("--conn", dest="connectivity_rule", type="int", help="Rule of connectivity ( 8 or 4). DEFAULT 8.") parser.add_option( "--percent", dest="TopPctElem", type="int", help= "Percentage of elements to be selected from each connected component. Default: 100, means all loops would be considered. If specified as 0, only the most significant loops from each component would be selected. For any number x between 0 and 100, top x% of the loops in a component, considering both statistical significance and contact count, would considered for inclusion, subject to the bin and neighborhood contraints." ) parser.add_option( "--Neigh", dest="NeighborHoodBin", type="int", help= "Positive integer (default: 2 with 5 Kb bin size) means that if a loop is included in the final set, loops involving within 2x2 neighborhood of both the bins would be discarded. Applicable only if --percent > 0. Difference in bin size other than 5000 may prompt user to change this value." ) parser.add_option( "--cccol", dest="CCCol", type="int", help="Column number storing the contact count. Default: 7.") parser.add_option( "--qcol", dest="QValCol", type="int", help= "Column number storing the q-value (or any measure of statistical significance). Default: 0, means the last column of the given interaction file. Any non-zero value would prompt the user to check the corresponding column." 
) parser.add_option( "--pcol", dest="PValCol", type="int", help= "Column number storing the p-value (or any measure of statistical significance). Default: 0, means the second last column of the given interaction file. Any non-zero value would prompt the user to check the corresponding column." ) parser.add_option( "--order", dest="SortOrder", type="int", help= "Binary variable indicating the sorting order of the given significance values. Default 0, means sorting is done by ascending order. If specified 1, sorting is done by descending (reverse) order." ) parser.set_defaults(InpFile=None, OutFile=None, binsize=5000, connectivity_rule=8, headerInp=1, TopPctElem=100, NeighborHoodBin=2, QValCol=0, PValCol=0, SortOrder=0, CCCol=7) (options, args) = parser.parse_args() #=========================== # process the input parameters #=========================== if options.InpFile is not None: InpFile = options.InpFile else: sys.exit("Input file is not provided - quit !!") # output file storing the bed formatted interactions if options.OutFile is not None: OutFile = options.OutFile else: sys.exit("Output file is not specified - quit !!") bin_size = int(options.binsize) headerInp = int(options.headerInp) connectivity_rule = int(options.connectivity_rule) TopPctElem = int(options.TopPctElem) NeighborHoodBinThr = (int(options.NeighborHoodBin)) * bin_size # parameters regarding significance and sorting order of statistical significance values QValCol = int(options.QValCol) PValCol = int(options.PValCol) CCCol = int(options.CCCol) SortOrder = int(options.SortOrder) #==================== # fix the columns containing P and Q-values # by reading the first line of the input interaction file fp_in = open(InpFile, 'r') l = fp_in.readline() contents = l.rstrip().split() if (QValCol == 0): QValCol = len(contents) if (PValCol == 0): PValCol = (len(contents) - 1) fp_in.close() #==================== # print the parameters if 1: print '****** Merge filtering of adjacent loops is enabled 
*****' print '***** within function of merged filtering - printing the parameters ***' print '*** bin_size: ', bin_size print '*** headerInp: ', headerInp print '*** connectivity_rule: ', connectivity_rule print '*** TopPctElem: ', TopPctElem print '*** NeighborHoodBinThr: ', NeighborHoodBinThr print '*** QValCol: ', QValCol print '*** PValCol: ', PValCol print '*** SortOrder: ', SortOrder # open the output file # if input interaction file has header information, # then dump the header in the output file as well fp_outInt = open(OutFile, 'w') if (headerInp == 1): fp_in = open(InpFile, 'r') l = fp_in.readline() contents = l.rstrip().split() # write the header corresponding to the chromosomes, contact count, P value and the Q value fp_outInt.write(contents[0] + '\t' + contents[1] + '\t' + contents[2] + '\t' + contents[3] + '\t' + contents[4] + '\t' + contents[5] + '\t' + contents[CCCol - 1] + '\t' + contents[PValCol - 1] + '\t' + contents[QValCol - 1] + '\t' + 'bin1_low' + '\t' + 'bin1_high' + '\t' + 'bin2_low' + '\t' + 'bin2_high' + '\t' + 'sumCC' + '\t' + 'StrongConn') fp_in.close() else: fp_outInt.write('chr1' + '\t' + 'start1' + '\t' + 'end1' + '\t' + 'chr2' + '\t' + 'start2' + '\t' + 'end2' + '\t' + 'CC' + '\t' + 'p' + '\t' + 'fdr' + '\t' + 'bin1_low' + '\t' + 'bin1_high' + '\t' + 'bin2_low' + '\t' + 'bin2_high' + '\t' + 'sumCC' + '\t' + 'StrongConn') # list of chromosomes to be experimented TargetChrList = [] for i in range(1, 23): curr_chr = 'chr' + str(i) TargetChrList.append(curr_chr) TargetChrList.append('chrX') TargetChrList.append('chrY') # output directory OutDir = os.path.dirname(os.path.realpath(OutFile)) if 1: print 'OutDir: ', str(OutDir) #========================================= # loop to process individual chromosomes and corresponding data #========================================= for chridx in range(len(TargetChrList)): curr_chr = TargetChrList[chridx] if 1: print 'Processing the chromosome: ', str(curr_chr) # extract the interactions of 
current chromosome from the complete set of interactions tempchrdumpfile = OutDir + '/Temp_chr_Dump.bed' if (headerInp == 1): awkcmd = "cat " + str( InpFile) + " | awk \'{if (NR>1 && $1==\"" + str( curr_chr) + "\" && $4==\"" + str( curr_chr) + "\"){print $0}}\' - > " + str( tempchrdumpfile) else: awkcmd = "cat " + str(InpFile) + " | awk \'{if ($1==\"" + str( curr_chr) + "\" && $4==\"" + str( curr_chr) + "\"){print $0}}\' - > " + str(tempchrdumpfile) os.system(awkcmd) # check the number of dumped interactions num_Int = sum(1 for line in open(tempchrdumpfile)) if (num_Int == 0): if 0: print 'Number of interactions for this chromosome = 0 --- continue' continue if 0: print 'Extracted interactions for the current chromosome' # # extract also the max span of interactions (6th column maximum element) # # so as to estimate the matrix size # temp_log_file = OutDir + '/Temp.log' # sys_cmd = "cat " + str(tempchrdumpfile) + " | cut -f6 | sort -nr > " + str(temp_log_file) # os.system(sys_cmd) # # determine the maximum coordinate # with open(temp_log_file, 'r') as fp_in: # l = fp_in.readline() # max_coord = int((l.rstrip()).split()[0]) # sys.stdout.flush() # test # # number of bins (matrix dimension) # nbins = (max_coord / bin_size) # if 0: # print 'max_coord of the interactions: ', str(max_coord) # print 'nbins: ', str(nbins) # create a graph which will store the interactions G = nx.Graph() # create a dictionary for storing the interactions CurrChrDict = dict() # now scan through the interactions of the extracted chromosome # and create a dictionary whose keys are the interacting bin numbers with open(tempchrdumpfile, 'r') as fp: for line in fp: linecontents = (line.rstrip()).split() # we set the bin number according to the end coordinate bin1 = int(linecontents[2]) / bin_size bin2 = int(linecontents[5]) / bin_size if (bin1 < bin2): curr_key = (bin1, bin2) else: curr_key = (bin2, bin1) # assign the key to the dictionary # add the contact count, P value and Q value information 
as well CurrChrDict.setdefault( curr_key, Interaction(int(linecontents[CCCol - 1]), float(linecontents[PValCol - 1]), float(linecontents[QValCol - 1]))) # add the node to the given graph as well G.add_node(curr_key) if 0: print 'Current interaction: ', str( line), ' ------ curr_key: ', curr_key # now check the nodes of G # assign edges of G according to the 8 / 4 connectivity rule (according to the input parameter) nodelist = list(G.nodes()) for i in range(len(nodelist) - 1): node1 = nodelist[i] for j in range(i + 1, len(nodelist)): node2 = nodelist[j] if 0: print 'Checking the edge between node 1: ', node1, ' and node 2: ', node2 # check if there should be an edge between node1 and node2 # according to the desired connectivity rule if (connectivity_rule == 8): if (abs(node1[0] - node2[0]) <= 1) and (abs(node1[1] - node2[1]) <= 1): G.add_edge(node1, node2) if 0: print '8 connectivity Edge between node 1: ', node1, ' and node 2: ', node2 if (connectivity_rule == 4): if ((abs(node1[0] - node2[0]) + abs(node1[1] - node2[1])) <= 1): G.add_edge(node1, node2) if 0: print '4 connectivity Edge between node 1: ', node1, ' and node 2: ', node2 # check the edges of G edgelist = list(G.edges()) if 1: print 'No of nodes of G: ', G.number_of_nodes() print 'No of edges of G: ', G.number_of_edges() print 'Number of connected components of G: ', nx.number_connected_components( G) # scan through individual connected components # for each such connected component (list of interactions) # we find a representative interaction and print it in the final output file list_conn_comp = sorted(nx.connected_components(G), key=len, reverse=True) if 0: print '\n\n**** Number of connected components: ', len( list_conn_comp), ' ****\n\n' #==================== # process individual connected components #==================== for i in range(len(list_conn_comp)): # a connected component - a particular list of connected nodes curr_comp_list = list(list_conn_comp[i]) if 0: print '\n\n\n ===>>>>> 
Processing the connected component no: ', i, ' list: ', str( curr_comp_list), ' number of elements: ', len( curr_comp_list) # from the first interacting bin set, get the lower and higher bin index min_idx_bin1 = min([x[0] for x in curr_comp_list]) max_idx_bin1 = max([x[0] for x in curr_comp_list]) if 0: print 'min_idx_bin1: ', min_idx_bin1, ' max_idx_bin1: ', max_idx_bin1 # get the span of coordinates for the first interacting bin (set) span_low_bin1 = (min_idx_bin1 - 1) * bin_size span_high_bin1 = max_idx_bin1 * bin_size if 0: print 'span_low_bin1: ', span_low_bin1, ' span_high_bin1: ', span_high_bin1 # from the second interacting bin set, get the lower and higher bin index min_idx_bin2 = min([x[1] for x in curr_comp_list]) max_idx_bin2 = max([x[1] for x in curr_comp_list]) if 0: print 'min_idx_bin2: ', min_idx_bin2, ' max_idx_bin2: ', max_idx_bin2 # get the span of coordinates for the first interacting bin (set) span_low_bin2 = (min_idx_bin2 - 1) * bin_size span_high_bin2 = max_idx_bin2 * bin_size if 0: print 'span_low_bin2: ', span_low_bin2, ' span_high_bin2: ', span_high_bin2 # sum of contact counts for all the interacting bins # within this set of connected nodes sum_cc = sum([CurrChrDict[x]._GetCC() for x in curr_comp_list]) # now get the percentage of bin pairs within this set of connected component # having a significant interaction total_possible_bin_pairs = (max_idx_bin1 - min_idx_bin1 + 1) * (max_idx_bin2 - min_idx_bin2 + 1) possible_bin_pairs = 0 for b1 in range(min_idx_bin1, (max_idx_bin1 + 1)): for b2 in range(min_idx_bin2, (max_idx_bin2 + 1)): bin_pair_key = (b1, b2) if bin_pair_key in CurrChrDict: possible_bin_pairs = possible_bin_pairs + 1 # % of bin pairs within the region spanned by this connected component # having significant interaction # the higher the %, the better this component is strongly connected Percent_Significant_BinPair = (possible_bin_pairs * 1.0) / total_possible_bin_pairs if 0: print ' ==>>> total_possible_bin_pairs: ', 
total_possible_bin_pairs, ' possible_bin_pairs: ', possible_bin_pairs, ' % clique: ', Percent_Significant_BinPair #================================================== # approach 1 : # if TopPctElem = 0 then # get the bin having maximum statistical significance and corresponding bin pairs # ties are resolved by maximum contact count # if SortOrder = 0, maximum statistical significance === min P and Q values # if SortOrder = 1, maximum statistical significance === max P and Q (equivalent) measures #================================================== if (TopPctElem == 0): for j in range(len(curr_comp_list)): curr_key = curr_comp_list[j] curr_cc = CurrChrDict[curr_key]._GetCC() curr_pval = CurrChrDict[curr_key]._GetPVal() curr_qval = CurrChrDict[curr_key]._GetQVal() curr_key_bin1_mid = (((curr_key[0] - 1) * bin_size) + (curr_key[0] * bin_size)) / 2 curr_key_bin2_mid = (((curr_key[1] - 1) * bin_size) + (curr_key[1] * bin_size)) / 2 if 0: print ' Connected component index: ', j, ' curr_key: ', curr_key, ' bin 1 mid: ', curr_key_bin1_mid, ' bin 2 mid: ', curr_key_bin2_mid, ' CC: ', curr_cc, ' Pval: ', curr_pval, ' Qval: ', curr_qval if (j == 0): # first index rep_bin_key = curr_key elif (SortOrder == 0) and ( curr_pval < CurrChrDict[rep_bin_key]._GetPVal() ) and (curr_qval < CurrChrDict[rep_bin_key]._GetQVal()): # current element has higher statistical significance (lower P or Q value when SortOrder = 0) rep_bin_key = curr_key elif (SortOrder == 1) and ( curr_pval > CurrChrDict[rep_bin_key]._GetPVal() ) and (curr_qval > CurrChrDict[rep_bin_key]._GetQVal()): # current element has higher statistical significance (higher P or Q value when SortOrder = 1) rep_bin_key = curr_key elif (curr_pval == CurrChrDict[rep_bin_key]._GetPVal()) and ( curr_qval == CurrChrDict[rep_bin_key]._GetQVal() ) and (curr_cc > CurrChrDict[rep_bin_key]._GetCC()): # current element has equal P and Q values # but higher contact count rep_bin_key = curr_key # fix the representative interaction rep_bin1_low 
= (rep_bin_key[0] - 1) * bin_size rep_bin1_high = rep_bin_key[0] * bin_size rep_bin2_low = (rep_bin_key[1] - 1) * bin_size rep_bin2_high = rep_bin_key[1] * bin_size cc = CurrChrDict[rep_bin_key]._GetCC() pval = CurrChrDict[rep_bin_key]._GetPVal() qval = CurrChrDict[rep_bin_key]._GetQVal() if 0: print '**** Selected bin key: ', rep_bin_key, ' start bin mid: ', ( rep_bin1_low + rep_bin1_high) / 2, ' end bin mid: ', ( rep_bin2_low + rep_bin2_high ) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval # write the interaction in the specified output file fp_outInt.write('\n' + str(curr_chr) + '\t' + str(rep_bin1_low) + '\t' + str(rep_bin1_high) + '\t' + str(curr_chr) + '\t' + str(rep_bin2_low) + '\t' + str(rep_bin2_high) + '\t' + str(cc) + '\t' + str(pval) + '\t' + str(qval) + '\t' + str(span_low_bin1) + '\t' + str(span_high_bin1) + '\t' + str(span_low_bin2) + '\t' + str(span_high_bin2) + '\t' + str(sum_cc) + '\t' + str(Percent_Significant_BinPair)) #================================================== # approach 2: # if TopPctElem > 0, and TopPctElem < 100 # then get the top (TopPctElem %) elements from # each connected component, (P and Q values, contact counts) # use these elements if they satisfy the bin neighborhood threshold #================================================== if (TopPctElem > 0) and (TopPctElem < 100): # list to store the q-values and the CC values for individual bin pairs # for their sequential extraction Curr_Comp_Tuple_List = [] # lists storing different attributes curr_conn_comp_CCList = [] curr_conn_comp_QValList = [] # process individual elements within this connected component for j in range(len(curr_comp_list)): curr_key = curr_comp_list[j] curr_cc = CurrChrDict[curr_key]._GetCC() curr_pval = CurrChrDict[curr_key]._GetPVal() curr_qval = CurrChrDict[curr_key]._GetQVal() curr_conn_comp_CCList.append(curr_cc) curr_conn_comp_QValList.append(curr_qval) # create a min-heap structure # first element: q-value # if SortOrder = 0, lower: better: use the 
same sign when insering in the heap # if SortOrder = 1, higher: better: reverse the sign when insering in the heap # for ties, second element (contact count) - higher: better # so, use negative signs if (SortOrder == 0): subl = [ curr_qval, ((-1) * curr_cc), curr_key[0], curr_key[1] ] else: subl = [((-1) * curr_qval), ((-1) * curr_cc), curr_key[0], curr_key[1]] # insert the element in the designated queue heapq.heappush(Curr_Comp_Tuple_List, subl) # first get the maximum / minimum values from these lists max_cc = max(curr_conn_comp_CCList) min_qval = min(curr_conn_comp_QValList) # now obtain the values of top K % elements # from these lists # where K = 50 means it is median custom_cc = custom_percent(curr_conn_comp_CCList, TopPctElem, 2) custom_qval = custom_percent(curr_conn_comp_QValList, TopPctElem, (SortOrder + 1)) if 0: print ' --> current connected component: max CC: ', max_cc, ' min Q val: ', min_qval, ' top K (TopPctElem): ', TopPctElem, ' custom_cc threshold: ', custom_cc, ' custom_qval threshold: ', custom_qval # this list stores the candidate interactions # from this particular connected component # that will be used in the final set of interactions Final_Rep_Key_List = [] # now extract elements from the constructed queue while (len(Curr_Comp_Tuple_List) > 0): curr_elem = heapq.heappop(Curr_Comp_Tuple_List) if 0: print 'extracted element from heap: ', curr_elem #=================================== # earlier condition - 1 - sourya # consider only those interactions # which have sufficient values of both contact count # and q-values # # terminating condition - do not consider elements # # with lower log 10 Q values than the custom_logqval # if ((curr_elem[0] * (-1)) < custom_logqval): # break # # coninue if the contact count falls below the designated threshold # if ((curr_elem[1] * (-1)) < custom_cc): # continue #=================================== # modified condition - sourya # consider those interactions having # significance value > K percentile if 
((SortOrder == 0) and (curr_elem[0] > custom_qval)) or ( (SortOrder == 1) and (curr_elem[0] < custom_qval)): break #=================================== # if this is the first element # then insert they key in the candidate set of interactions if (len(Final_Rep_Key_List) == 0): subl = [curr_elem[2], curr_elem[3]] Final_Rep_Key_List.append(subl) if 0: print '\t\t *** inserted element in the final list: ', str( subl), ' generated Final_Rep_Key_List: ', str( Final_Rep_Key_List) continue # otherwise, check with the existing interactions # and do not include if the bin falls within a certain # neighborhood of earlier included interactions # the neighborhood is already mentioned via command line parameters flag = False for i in range(len(Final_Rep_Key_List)): # both ends of the bins should be within neighborhood thresholds # of existing contacts if (((abs(Final_Rep_Key_List[i][0] - curr_elem[2])) * bin_size) <= NeighborHoodBinThr) and (( (abs(Final_Rep_Key_List[i][1] - curr_elem[3])) * bin_size) <= NeighborHoodBinThr): flag = True if 0: print ' --- current element is within neighborhood of the bins indexed by ', i, ' of Final_Rep_Key_List' break if (flag == False): # there is no such neighborhood constraints # include the bin subl = [curr_elem[2], curr_elem[3]] Final_Rep_Key_List.append(subl) if 0: print '\t\t *** inserted element in the final list: ', str( subl), ' generated Final_Rep_Key_List: ', str( Final_Rep_Key_List) # now print the candidate interactions # of the current component for i in range(len(Final_Rep_Key_List)): rep_bin_key = (Final_Rep_Key_List[i][0], Final_Rep_Key_List[i][1]) # fix the representative interaction rep_bin1_low = (rep_bin_key[0] - 1) * bin_size rep_bin1_high = rep_bin_key[0] * bin_size rep_bin2_low = (rep_bin_key[1] - 1) * bin_size rep_bin2_high = rep_bin_key[1] * bin_size cc = CurrChrDict[rep_bin_key]._GetCC() pval = CurrChrDict[rep_bin_key]._GetPVal() qval = CurrChrDict[rep_bin_key]._GetQVal() if 1: print '**** Selected bin key: ', 
rep_bin_key, ' start bin mid: ', ( rep_bin1_low + rep_bin1_high ) / 2, ' end bin mid: ', ( rep_bin2_low + rep_bin2_high ) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval # write the interaction in the specified output file fp_outInt.write('\n' + str(curr_chr) + '\t' + str(rep_bin1_low) + '\t' + str(rep_bin1_high) + '\t' + str(curr_chr) + '\t' + str(rep_bin2_low) + '\t' + str(rep_bin2_high) + '\t' + str(cc) + '\t' + str(pval) + '\t' + str(qval) + '\t' + str(span_low_bin1) + '\t' + str(span_high_bin1) + '\t' + str(span_low_bin2) + '\t' + str(span_high_bin2) + '\t' + str(sum_cc) + '\t' + str(Percent_Significant_BinPair)) #================================================== # approach 3: # if TopPctElem = 100 (latest implementation) # then sequentially obtain the interactions with the lowest q-value # and break the tie for the higher contact count # use these elements if they satisfy the bin neighborhood threshold #================================================== if (TopPctElem == 100): # list to store the q-values and the CC values for individual bin pairs # for their sequential extraction Curr_Comp_Tuple_List = [] # lists storing different attributes curr_conn_comp_CCList = [] curr_conn_comp_QValList = [] # process individual elements within this connected component for j in range(len(curr_comp_list)): curr_key = curr_comp_list[j] curr_cc = CurrChrDict[curr_key]._GetCC() curr_qval = CurrChrDict[curr_key]._GetQVal() curr_conn_comp_CCList.append(curr_cc) curr_conn_comp_QValList.append(curr_qval) # create a min-heap structure # first element: q-value # if SortOrder = 0, lower: better: use the same sign when insering in the heap # if SortOrder = 1, higher: better: reverse the sign when insering in the heap # for ties, second element (contact count) - higher: better # so, use negative signs if (SortOrder == 0): subl = [ curr_qval, ((-1) * curr_cc), curr_key[0], curr_key[1] ] else: subl = [((-1) * curr_qval), ((-1) * curr_cc), curr_key[0], curr_key[1]] # insert the 
element in the designated queue (min-heap property) heapq.heappush(Curr_Comp_Tuple_List, subl) # this list stores the candidate interactions # from this particular connected component # that will be used in the final set of interactions Final_Rep_Key_List = [] if 0: print ' **** Processing the connected component ===== number of elements: ', len( Curr_Comp_Tuple_List) # now extract elements from the constructed queue while (len(Curr_Comp_Tuple_List) > 0): # extract the first element from the min-heap # element with the lowest value curr_elem = heapq.heappop(Curr_Comp_Tuple_List) if 0: print 'extracted element from heap: ', curr_elem # if this is the first element # then insert they key in the candidate set of interactions if (len(Final_Rep_Key_List) == 0): subl = [curr_elem[2], curr_elem[3]] Final_Rep_Key_List.append(subl) if 0: print '*** inserted element in the final list: ', str( subl) continue # otherwise, check with the existing interactions # and do not include if the bin falls within a certain # neighborhood of earlier included interactions # the neighborhood is already mentioned via command line parameters flag = False for i in range(len(Final_Rep_Key_List)): # both ends of the bins should be within neighborhood thresholds # of existing contacts if (((abs(Final_Rep_Key_List[i][0] - curr_elem[2])) * bin_size) <= NeighborHoodBinThr) and (( (abs(Final_Rep_Key_List[i][1] - curr_elem[3])) * bin_size) <= NeighborHoodBinThr): flag = True if 0: print ' --- current element is within neighborhood of the existing (included) bin ', Final_Rep_Key_List[ i] break if (flag == False): # there is no such neighborhood constraints # include the bin subl = [curr_elem[2], curr_elem[3]] Final_Rep_Key_List.append(subl) if 0: print '*** inserted element in the final list: ', str( subl) # now print the candidate interactions # of the current component if 0: print '\n\n**** Printing selected loops of the connected component ***\n\n' for i in range(len(Final_Rep_Key_List)): 
rep_bin_key = (Final_Rep_Key_List[i][0], Final_Rep_Key_List[i][1]) # fix the representative interaction rep_bin1_low = (rep_bin_key[0] - 1) * bin_size rep_bin1_high = rep_bin_key[0] * bin_size rep_bin2_low = (rep_bin_key[1] - 1) * bin_size rep_bin2_high = rep_bin_key[1] * bin_size cc = CurrChrDict[rep_bin_key]._GetCC() pval = CurrChrDict[rep_bin_key]._GetPVal() qval = CurrChrDict[rep_bin_key]._GetQVal() if 0: print 'Selected bin key: ', rep_bin_key, ' start bin mid: ', ( rep_bin1_low + rep_bin1_high ) / 2, ' end bin mid: ', ( rep_bin2_low + rep_bin2_high ) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval # write the interaction in the specified output file fp_outInt.write('\n' + str(curr_chr) + '\t' + str(rep_bin1_low) + '\t' + str(rep_bin1_high) + '\t' + str(curr_chr) + '\t' + str(rep_bin2_low) + '\t' + str(rep_bin2_high) + '\t' + str(cc) + '\t' + str(pval) + '\t' + str(qval) + '\t' + str(span_low_bin1) + '\t' + str(span_high_bin1) + '\t' + str(span_low_bin2) + '\t' + str(span_high_bin2) + '\t' + str(sum_cc) + '\t' + str(Percent_Significant_BinPair)) #================================================== # after processing all the chromosomes, now close the output interaction file fp_outInt.close() # # remove the temp log file which stores the max coordinate for a chromosome # sys_cmd = "rm " + str(temp_log_file) # os.system(sys_cmd) # remove the temporary chromosome specific interaction dump file sys_cmd = "rm " + str(tempchrdumpfile) os.system(sys_cmd) if 1: print '==================== End of merge filtering adjacent interactions !!! ======================'
def get(self):
    """Pop the smallest item off the queue and return it.

    The item is removed from ``self.heap`` and its entry is deleted from
    the ``self.set`` lookup. Raises ``IndexError`` when the heap is empty.
    """
    item = heapq.heappop(self.heap)
    # Keep the membership lookup in step with the heap.
    del self.set[item]
    return item
def aStarSearch(self, distance):
    """Search from this node with A* and return the goal node reached.

    Nodes are expanded in order of ``cost_history + heuristics(...)``.
    Each expansion calls ``node.move()`` and then considers ``node.left``,
    ``node.right``, ``node.up`` and ``node.down``; a neighbour is queued
    when it is not ``None`` and its ``state`` has not been visited.

    :param distance: passed through to ``heuristics`` as its first argument.
    :returns: the first popped node whose ``checkEnd()`` holds, or ``None``
        when the frontier runs empty or any error occurs during the search.

    ``end_position`` is not defined in this method; it is looked up in the
    enclosing module scope at call time.
    """
    try:
        path_queue = []
        visited_list = []
        # ``count / base`` adds a tiny, strictly increasing offset to every
        # priority so that equal-cost entries pop in insertion order.
        count = float(0)
        base = float(10000000000)

        node = self
        visited_list.append(node.state)

        # ``== False`` is kept on purpose: a falsy non-False result (such as
        # None) must still end the loop as it did before.
        while node.checkEnd() == False:
            node.move()
            for direction in ('left', 'right', 'up', 'down'):
                child = getattr(node, direction)
                if child is None or child.state in visited_list:
                    continue
                count += 1
                child.updateCosthistory(
                    node.cost_history + node.pathcost(child))
                priority = (child.cost_history +
                            child.heuristics(distance, end_position) +
                            count / base)
                heapq.heappush(path_queue, (priority, child))

            if not path_queue:
                # Frontier exhausted without reaching the goal.
                return None
            _, node = heapq.heappop(path_queue)
            visited_list.append(node.state)
        return node
    except Exception:
        # Callers rely on None for any failure; unlike a bare ``except``
        # this lets KeyboardInterrupt and SystemExit propagate.
        return None
def uniformCostSearch(self):
    """Search from this node with uniform-cost search; return the goal node.

    Nodes are expanded in order of their cumulative path cost. Each
    expansion calls ``node.move()`` and then considers ``node.left``,
    ``node.right``, ``node.up`` and ``node.down``; a neighbour is queued
    when it is not ``None`` and its ``state`` has not been visited. The
    cumulative cost of every queued neighbour is recorded through
    ``updateCosthistory``, the same way ``aStarSearch`` does.

    :returns: the first popped node whose ``checkEnd()`` holds, or ``None``
        when the frontier runs empty or any error occurs during the search.
    """
    try:
        path_queue = []
        visited_list = []
        # ``count / base`` adds a tiny, strictly increasing offset to every
        # priority so that equal-cost entries pop in insertion order.
        count = float(0)
        base = float(10000000000)

        node = self
        visited_list.append(node.state)

        # ``== False`` is kept on purpose: a falsy non-False result (such as
        # None) must still end the loop as it did before.
        while node.checkEnd() == False:
            node.move()
            for direction in ('left', 'right', 'up', 'down'):
                child = getattr(node, direction)
                if child is None or child.state in visited_list:
                    continue
                count += 1
                # Order by the cost of the whole path to the child; ordering
                # by the last step alone can return a costlier route.
                child.updateCosthistory(
                    node.cost_history + node.pathcost(child))
                heapq.heappush(
                    path_queue, (child.cost_history + count / base, child))

            if not path_queue:
                # Frontier exhausted without reaching the goal.
                return None
            _, node = heapq.heappop(path_queue)
            visited_list.append(node.state)
        return node
    except Exception:
        # Callers rely on None for any failure; unlike a bare ``except``
        # this lets KeyboardInterrupt and SystemExit propagate.
        return None
def findKthLargest(self, nums, k):
    """Return the k-th largest element of ``nums`` (``k == 1`` is the maximum).

    Duplicates count as separate elements. ``nums`` is left unmodified.

    :param nums: sequence of mutually comparable values.
    :param k: 1-based rank counted from the largest element; expected to be
        in ``1..len(nums)``.
    :return: the element at index ``-k`` of the ascending ordering.
    :raises IndexError: if ``k`` exceeds ``len(nums)`` (including empty input).
    """
    # Sort a copy: heapifying and draining ``nums`` in place would empty the
    # caller's list.
    ascending = sorted(nums)
    return ascending[-k]
def search(self, query, return_length=100, passage_len=50, return_urls_only=False):
    '''
    Performs search on loaded data.

    Query words found in ``self.dictionary`` have their postings read from
    ``self.index``; candidates are combined with ``join``, cut down to at
    most ``return_length`` by a BM25-style score, then ordered by the best
    passage score found in a window of at most ``passage_len`` positions.

    Returns a list sorted by descending passage score of:
        * tuples (url_id, url, score) if return_urls_only == False
        * tuples (url_id, url) if return_urls_only == True
    An empty list is returned when the query has no words or no
    candidate document is found.
    '''
    words = [w for w in query.strip().split(" ") if w != '']
    if not words:
        return []

    # Load the postings of every query word present in the dictionary.
    result = None
    word_index = [None] * len(words)
    for z, word in enumerate(words):
        # Byte strings are decoded as UTF-8; text strings are used as they are.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = self.norm(word.strip())
        if word not in self.dictionary:
            continue
        self.index.seek(self.dictionary[word], 0)
        compressed = self.index.readline().strip()
        decompressed = None
        if self.encoding == VARBYTE:
            decompressed = decode_varbyte(base64.b64decode(compressed))
        elif self.encoding == SIMPLE9:
            decompressed = decode_simple9(base64.b64decode(compressed))
        # NOTE(review): from_flat appears to return (doc ids, {doc id:
        # (document length, [positions])}) - inferred from how the values
        # are used below; confirm against its definition.
        decompressed, word_index[z] = from_flat(decompressed)
        if result is None:
            result = decompressed
        else:
            result = join(result, decompressed)

    if result is None or len(result) == 0:
        return []

    # Query-word slots that actually have postings (non-empty, since
    # ``result`` is set).
    indexed = [j for j, postings in enumerate(word_index) if postings is not None]

    # Mean of the per-document [0] value, taken from the first indexed word.
    first_postings = word_index[indexed[0]]
    avg_len = 0.
    for doc in result:
        if doc in first_postings:
            avg_len += first_postings[doc][0]
    avg_len /= len(result)

    # BM25-style score per candidate, summed over the indexed query words.
    k1 = 2
    b = 0.75
    idf = {}
    for j in indexed:
        idf[j] = log(float(self.N) / len(word_index[j]))
    BM25 = [0] * len(result)
    for j in indexed:
        postings = word_index[j]
        for i, doc in enumerate(result):
            if doc in postings:
                doc_len = postings[doc][0]
                tf = float(len(postings[doc][1])) / doc_len
                BM25[i] += tf * idf[j] / (tf + k1 * (b + doc_len / avg_len * (1 - b)))

    # Keep only the return_length best candidates using a bounded min-heap.
    if len(result) > return_length:
        ranked = list(zip(BM25, result))
        heap = ranked[:return_length]
        heapq.heapify(heap)
        for rank, ind in ranked[return_length:]:
            # heap[0] is the weakest kept candidate; replace it only when
            # strictly beaten.
            if heap[0][0] < rank:
                heapq.heapreplace(heap, (rank, ind))
        result = [ind for rank, ind in heap]

    # Passage ranking: for every candidate, score each window of query-word
    # occurrences that fits in passage_len and keep the best score.
    word_count = float(len(words))
    norm_idf = {}
    for j in indexed:
        norm_idf[j] = idf[j] / log(self.N)
    scores = [0] * len(result)
    for i, doc in enumerate(result):
        passage = []
        for j in indexed:
            if doc in word_index[j]:
                passage.extend([(x, j) for x in word_index[j][doc][1]])
        passage.sort()
        for lo in range(len(passage)):
            start = passage[lo][0]
            for hi in range(lo, len(passage)):
                span = passage[hi][0] - start + 1
                if span > passage_len:
                    # Positions are sorted, so every later window is wider.
                    break
                passage_w = [x[1] for x in passage[lo:hi + 1]]
                size = len(passage_w)
                # Share of distinct query words covered by the window.
                coverage = len(set(passage_w)) / word_count
                # Closeness of the window start to the document beginning.
                earliness = 1 - float(start) / word_index[passage[lo][1]][doc][0]
                # One minus the fraction of window positions that are hits.
                sparsity = 1 - float(size) / span
                # Normalised tf-idf of the query words inside the window.
                tf_idf = 0
                for j in indexed:
                    tf_idf += float(passage_w.count(j)) / span * norm_idf[j]
                # Fraction of word pairs appearing out of query order.
                inversions = 0
                for a in range(size - 1):
                    for c in range(a + 1, size):
                        if passage_w[a] > passage_w[c]:
                            inversions += 1
                if size != 1:
                    inversions /= size * (size - 1) / 2.0
                score = coverage + earliness + sparsity + tf_idf + inversions
                if score > scores[i]:
                    scores[i] = score

    final_result = [(url_id, self.urls[url_id], score)
                    for score, url_id in sorted(zip(scores, result), reverse=True)]
    if return_urls_only:
        final_result = [entry[:-1] for entry in final_result]
    return final_result