Exemple #1
0
 def findKthLargest(self, nums, k):
     """
     Return the k-th largest value in ``nums``.

     The ``k`` largest values are selected and the smallest of them is
     the answer.  When ``k`` exceeds ``len(nums)`` the overall minimum
     is returned; a non-positive ``k`` raises ``IndexError``.

     :type nums: List[int]
     :type k: int
     :rtype: int
     """
     # An empty input has no k-th largest element; 0 is the sentinel result.
     if not nums:
         return 0

     # nlargest yields the top-k values in descending order, so the last
     # entry is exactly what a size-k min-heap would leave at its root.
     top_k = heapq.nlargest(k, nums)
     return top_k[-1]
Exemple #2
0
	def find_hits(self, tensor, opts):
		"""Return the best ganglion hits found in ``tensor``, best score first.

		``tensor`` is walked three levels deep; each innermost entry ``T3`` is
		read as ``T3[0]`` (hit score) and ``T3[1]`` (compared against
		``self.gang_window()[t1]``, where ``t1`` is the outermost index).
		Only entries whose ``T3[1]`` equals that window value are kept.

		``opts`` selects how many hits are returned:
			- int: the top ``opts`` results
			- str of a number: that percentile, in % (e.g. ``'10'`` -> top 10%)
			- ``'tenth_percentile'``: shorthand for the top 10%
			- float: that percentile as a fraction (e.g. ``0.1`` -> top 10%)

		Returns a list of tuples ``(hitscore, T3[1], t1, t2, t3)`` sorted by
		descending score, or ``None`` when ``opts`` is of an unsupported type
		or is a string that is neither numeric nor a known keyword.
		"""
		if isinstance(opts, str):
			if opts == 'tenth_percentile':
				# The keyword used to fall into the float() conversion below,
				# fail, and silently yield None; map it to its fraction.
				opts = 0.1
			else:
				try:
					opts = float(opts) / 100
				except ValueError:
					return None
		elif not (isinstance(opts, int) or isinstance(opts, float)):
			return None

		gang_window = self.gang_window()
		hits = []
		for t1, T1 in enumerate(tensor):
			for t2, T2 in enumerate(T1):
				for t3, T3 in enumerate(T2):
					if T3[1] == gang_window[t1]:
						# Negate the score so an ascending sort puts the
						# highest-scoring hit first.
						hits.append((-T3[0], T3[1], t1, t2, t3))
		hits.sort()
		# Restore the real (positive-sense) score in the returned tuples.
		x = [(-hit[0],) + hit[1:] for hit in hits]

		if isinstance(opts, float):
			percentile = int(math.ceil(len(x) * opts))
			return x[0:percentile]
		return x[0:opts]
def astar(start, h, d=None):
    """Run a best-first (A*) search from ``start`` using heuristic ``h``.

    Nodes are kept in a heap, so they are expanded in the order defined by
    the ``Node`` comparison (presumably by ``f`` -- confirm against
    ``Node``).  Each successor gets ``g = parent.g + 1`` and
    ``f = g + h(successor)``.

    Args:
        start: initial state, wrapped in ``Node`` before being queued.
        h: callable taking a node and returning its heuristic estimate.
        d: optional depth cut-off used for collecting table data.  When it
            is given and is at least 2, the search stops at the first
            expanded node whose ``g`` equals ``d``: that node is returned
            if it is a goal, otherwise the search gives up with ``None``.

    Returns:
        ``[goal_node, count]`` where ``count`` is
        ``len(frontier) + len(explored) + 1`` at the moment the goal is
        found, or ``None`` if no goal is reached.
    """
    explored = set()
    frontier = []  # named to avoid shadowing the builtin ``open``
    hq.heappush(frontier, Node(start))

    # ``None >= 2`` raises TypeError on Python 3, so the default ``d=None``
    # must be excluded explicitly before comparing.
    depth_limited = d is not None and d >= 2

    while frontier:
        current = hq.heappop(frontier)

        # used for getting table data
        if depth_limited and current.g == d:
            if isGoal(current):
                return [current, len(frontier) + len(explored) + 1]
            return None

        if isGoal(current):
            return [current, len(frontier) + len(explored) + 1]
        explored.add(str(current))

        for succ in successors(current):
            # States are de-duplicated through their string form.
            if str(succ) not in explored:
                succ.g = current.g + 1
                succ.f = succ.g + h(succ)
                succ.parent = current
                hq.heappush(frontier, succ)
    return None
def astar(start):
    """Search from ``start`` with A*, using each state's own heuristic.

    States must provide ``heuristic()``, ``isGoal()`` and ``moves()``; the
    latter yields ``(child_state, move)`` pairs.  Every move costs 1.
    Expanded states are remembered by their ``str()`` form.

    Returns ``(cost, moves)`` for the first goal state taken off the
    queue, or ``None`` when the queue runs dry.
    """
    visited = set()
    # Queue entries are [f, g, state, moves-so-far]; the heap orders on f.
    frontier = [[start.heuristic(), 0, start, []]]

    while frontier:
        _, cost, state, trail = hq.heappop(frontier)
        if state.isGoal():
            return cost, trail

        visited.add(str(state))
        next_cost = cost + 1
        for successor, step in state.moves():
            if str(successor) in visited:
                continue
            # Each queued entry owns an independent copy of its move list.
            extended = deepcopy(trail)
            extended.append(step)
            entry = [next_cost + successor.heuristic(), next_cost,
                     successor, extended]
            hq.heappush(frontier, entry)

    return None
    def _get_next_forwarder(self):
        """
        Hand out the forwarder at the top of the load heap.

        The first entry of ``self._available_forwarderloads`` (the minimum
        according to the entries' own ordering) is popped, its ``load``
        counter is bumped by one, and the entry is pushed back so that
        later calls see the accumulated load.

        Raises DispatchEngineException when an IndexError occurs, i.e.
        when there is nothing to pop from the heap.
        """

        # NOTE(review): the heap is mutated while only the reader lock is
        # held -- confirm that this is safe for concurrent callers.
        with self._forwarders_lock.reader_lock:
            try:
                least_loaded = heapq.heappop(self._available_forwarderloads)
                chosen = least_loaded.forwarder
                least_loaded.load += 1
                heapq.heappush(self._available_forwarderloads, least_loaded)
            except IndexError:
                raise DispatchEngineException("No available forwarders")
            return chosen
def astar(start):
    """A* search driven by the heuristic each state reports for itself.

    A state is expected to expose ``heuristic()``, ``isGoal()`` and
    ``moves()`` (an iterable of ``(child_state, move)`` pairs).  Each move
    adds 1 to the path cost, and states already expanded are recognised by
    their ``str()`` representation.

    Returns a ``(cost, moves)`` tuple for the first goal popped from the
    queue, or ``None`` if the queue is exhausted first.
    """
    seen = set()
    # Each heap item is [f, g, state, list of moves taken to reach it].
    queue = [[start.heuristic(), 0, start, []]]

    while queue:
        _, depth, node, history = hq.heappop(queue)
        if node.isGoal():
            return depth, history

        seen.add(str(node))
        child_depth = depth + 1
        for nxt, action in node.moves():
            if str(nxt) in seen:
                continue
            # Copy so sibling branches never share a move list.
            branch = deepcopy(history)
            branch.append(action)
            hq.heappush(
                queue,
                [child_depth + nxt.heuristic(), child_depth, nxt, branch])

    return None
Exemple #7
0
    def get(self, url, max_last_time):
        """ Pick a proxy (of the form http://8.8.8.8:8000) for the domain of ``url``.

            If some proxy in self._pool is not yet in self._table[domain]:
                register one of the missing proxies for the domain and
                return it.
            Otherwise:
                pop the entry with the lowest last_time from the domain's
                priority queue and compare it with ``max_last_time``;
                if last_time is greater than ``max_last_time``, put the
                entry back and return None;
                else push it back stamped with the current time, update
                self._table[domain][proxy] and return the proxy.

            Returns None as well when the pool is empty or the priority
            queue has nothing to pop.
        """
        domain = urlparse.urlparse(url).netloc
        proxies_table = self._table[domain]
        now = time.time()

        if not self._pool:
            return
        unused = self._pool.difference(set(proxies_table.keys()))

        if unused:
            # First use of this proxy for the domain: start from count 0.
            first_count = self._count_rule('get', 0)
            proxy = unused.pop()
            proxies_table[proxy] = [now, first_count]
            queue = proxies_table.setdefault('priority', [])
            heapq.heappush(queue, [now, first_count, proxy])
            return proxy

        try:
            entry = heapq.heappop(proxies_table['priority'])
            last_time, count, proxy = entry
            if max_last_time < last_time:
                # Used too recently: restore the queue and hand out nothing.
                heapq.heappush(proxies_table['priority'], entry)
                return

            new_count = self._count_rule('get', count)
            proxies_table[proxy] = [now, new_count]
            heapq.heappush(proxies_table['priority'], [now, new_count, proxy])
            return proxy
        except IndexError:
            print('priority queue is empty.')
Exemple #8
0
def main():
    parser = OptionParser()  #(usage=usage)
    parser.add_option(
        "--InpFile",
        dest="InpFile",
        help=
        "Input interaction file. Assumed that the first six fields contain two interacting chromosome information."
    )
    parser.add_option(
        "--headerInp",
        dest="headerInp",
        type="int",
        help=
        "If 1, indicates that input interaction file has a header line (such as field names). Default 1."
    )
    parser.add_option("--OutFile",
                      dest="OutFile",
                      help="Output merged interaction file")
    parser.add_option("--binsize",
                      dest="binsize",
                      type="int",
                      help="Size of bins employed. DEFAULT 5 Kb.")
    parser.add_option("--conn",
                      dest="connectivity_rule",
                      type="int",
                      help="Rule of connectivity ( 8 or 4). DEFAULT 8.")
    parser.add_option(
        "--percent",
        dest="TopPctElem",
        type="int",
        help=
        "Percentage of elements to be selected from each connected component. Default: 100, means all loops would be considered. If specified as 0, only the most significant loops from each component would be selected. For any number x between 0 and 100, top x% of the loops in a component, considering both statistical significance and contact count, would considered for inclusion, subject to the bin and neighborhood contraints."
    )
    parser.add_option(
        "--Neigh",
        dest="NeighborHoodBin",
        type="int",
        help=
        "Positive integer (default: 2 with 5 Kb bin size) means that if a loop is included in the final set, loops involving within 2x2 neighborhood of both the bins would be discarded. Applicable only if --percent > 0. Difference in bin size other than 5000 may prompt user to change this value."
    )
    parser.add_option(
        "--cccol",
        dest="CCCol",
        type="int",
        help="Column number storing the contact count. Default: 7.")
    parser.add_option(
        "--qcol",
        dest="QValCol",
        type="int",
        help=
        "Column number storing the q-value (or any measure of statistical significance). Default: 0, means the last column of the given interaction file. Any non-zero value would prompt the user to check the corresponding column."
    )
    parser.add_option(
        "--pcol",
        dest="PValCol",
        type="int",
        help=
        "Column number storing the p-value (or any measure of statistical significance). Default: 0, means the second last column of the given interaction file. Any non-zero value would prompt the user to check the corresponding column."
    )
    parser.add_option(
        "--order",
        dest="SortOrder",
        type="int",
        help=
        "Binary variable indicating the sorting order of the given significance values. Default 0, means sorting is done by ascending order. If specified 1, sorting is done by descending (reverse) order."
    )
    parser.set_defaults(InpFile=None,
                        OutFile=None,
                        binsize=5000,
                        connectivity_rule=8,
                        headerInp=1,
                        TopPctElem=100,
                        NeighborHoodBin=2,
                        QValCol=0,
                        PValCol=0,
                        SortOrder=0,
                        CCCol=7)
    (options, args) = parser.parse_args()

    #===========================
    # process the input parameters
    #===========================
    if options.InpFile is not None:
        InpFile = options.InpFile
    else:
        sys.exit("Input file is not provided - quit !!")

    # output file storing the bed formatted interactions
    if options.OutFile is not None:
        OutFile = options.OutFile
    else:
        sys.exit("Output file is not specified - quit !!")

    bin_size = int(options.binsize)
    headerInp = int(options.headerInp)
    connectivity_rule = int(options.connectivity_rule)
    TopPctElem = int(options.TopPctElem)
    NeighborHoodBinThr = (int(options.NeighborHoodBin)) * bin_size

    # parameters regarding significance and sorting order of statistical significance values
    QValCol = int(options.QValCol)
    PValCol = int(options.PValCol)
    CCCol = int(options.CCCol)
    SortOrder = int(options.SortOrder)

    #====================
    # fix the columns containing P and Q-values
    # by reading the first line of the input interaction file
    fp_in = open(InpFile, 'r')
    l = fp_in.readline()
    contents = l.rstrip().split()
    if (QValCol == 0):
        QValCol = len(contents)

    if (PValCol == 0):
        PValCol = (len(contents) - 1)

    fp_in.close()
    #====================

    # print the parameters
    if 1:
        print '****** Merge filtering of adjacent loops is enabled *****'
        print '***** within function of merged filtering - printing the parameters ***'
        print '*** bin_size: ', bin_size
        print '*** headerInp: ', headerInp
        print '*** connectivity_rule: ', connectivity_rule
        print '*** TopPctElem: ', TopPctElem
        print '*** NeighborHoodBinThr: ', NeighborHoodBinThr
        print '*** QValCol: ', QValCol
        print '*** PValCol: ', PValCol
        print '*** SortOrder: ', SortOrder

    # open the output file
    # if input interaction file has header information,
    # then dump the header in the output file as well
    fp_outInt = open(OutFile, 'w')

    if (headerInp == 1):
        fp_in = open(InpFile, 'r')
        l = fp_in.readline()
        contents = l.rstrip().split()
        # write the header corresponding to the chromosomes, contact count, P value and the Q value
        fp_outInt.write(contents[0] + '\t' + contents[1] + '\t' + contents[2] +
                        '\t' + contents[3] + '\t' + contents[4] + '\t' +
                        contents[5] + '\t' + contents[CCCol - 1] + '\t' +
                        contents[PValCol - 1] + '\t' + contents[QValCol - 1] +
                        '\t' + 'bin1_low' + '\t' + 'bin1_high' + '\t' +
                        'bin2_low' + '\t' + 'bin2_high' + '\t' + 'sumCC' +
                        '\t' + 'StrongConn')
        fp_in.close()
    else:
        fp_outInt.write('chr1' + '\t' + 'start1' + '\t' + 'end1' + '\t' +
                        'chr2' + '\t' + 'start2' + '\t' + 'end2' + '\t' +
                        'CC' + '\t' + 'p' + '\t' + 'fdr' + '\t' + 'bin1_low' +
                        '\t' + 'bin1_high' + '\t' + 'bin2_low' + '\t' +
                        'bin2_high' + '\t' + 'sumCC' + '\t' + 'StrongConn')

    # list of chromosomes to be experimented
    TargetChrList = []
    for i in range(1, 23):
        curr_chr = 'chr' + str(i)
        TargetChrList.append(curr_chr)
    TargetChrList.append('chrX')
    TargetChrList.append('chrY')

    # output directory
    OutDir = os.path.dirname(os.path.realpath(OutFile))
    if 1:
        print 'OutDir: ', str(OutDir)

    #=========================================
    # loop to process individual chromosomes and corresponding data
    #=========================================
    for chridx in range(len(TargetChrList)):
        curr_chr = TargetChrList[chridx]
        if 1:
            print 'Processing the chromosome: ', str(curr_chr)

        # extract the interactions of current chromosome from the complete set of interactions
        tempchrdumpfile = OutDir + '/Temp_chr_Dump.bed'
        if (headerInp == 1):
            awkcmd = "cat " + str(
                InpFile) + " | awk \'{if (NR>1 && $1==\"" + str(
                    curr_chr) + "\" && $4==\"" + str(
                        curr_chr) + "\"){print $0}}\' -  > " + str(
                            tempchrdumpfile)
        else:
            awkcmd = "cat " + str(InpFile) + " | awk \'{if ($1==\"" + str(
                curr_chr) + "\" && $4==\"" + str(
                    curr_chr) + "\"){print $0}}\' -  > " + str(tempchrdumpfile)
        os.system(awkcmd)

        # check the number of dumped interactions
        num_Int = sum(1 for line in open(tempchrdumpfile))

        if (num_Int == 0):
            if 0:
                print 'Number of interactions for this chromosome = 0 --- continue'
            continue

        if 0:
            print 'Extracted interactions for the current chromosome'

        # # extract also the max span of interactions (6th column maximum element)
        # # so as to estimate the matrix size
        # temp_log_file = OutDir + '/Temp.log'
        # sys_cmd = "cat " + str(tempchrdumpfile) + " | cut -f6 | sort -nr  > " + str(temp_log_file)
        # os.system(sys_cmd)

        # # determine the maximum coordinate
        # with open(temp_log_file, 'r') as fp_in:
        #     l = fp_in.readline()
        #     max_coord = int((l.rstrip()).split()[0])

        # sys.stdout.flush()  # test

        # # number of bins (matrix dimension)
        # nbins = (max_coord / bin_size)
        # if 0:
        #     print 'max_coord of the interactions: ', str(max_coord)
        #     print 'nbins: ', str(nbins)

        # create a graph which will store the interactions
        G = nx.Graph()

        # create a dictionary for storing the interactions
        CurrChrDict = dict()

        # now scan through the interactions of the extracted chromosome
        # and create a dictionary whose keys are the interacting bin numbers
        with open(tempchrdumpfile, 'r') as fp:
            for line in fp:
                linecontents = (line.rstrip()).split()
                # we set the bin number according to the end coordinate
                bin1 = int(linecontents[2]) / bin_size
                bin2 = int(linecontents[5]) / bin_size
                if (bin1 < bin2):
                    curr_key = (bin1, bin2)
                else:
                    curr_key = (bin2, bin1)
                # assign the key to the dictionary
                # add the contact count, P value and Q value information as well
                CurrChrDict.setdefault(
                    curr_key,
                    Interaction(int(linecontents[CCCol - 1]),
                                float(linecontents[PValCol - 1]),
                                float(linecontents[QValCol - 1])))
                # add the node to the given graph as well
                G.add_node(curr_key)
                if 0:
                    print 'Current interaction: ', str(
                        line), '  ------ curr_key: ', curr_key

        # now check the nodes of G
        # assign edges of G according to the 8 / 4 connectivity rule (according to the input parameter)
        nodelist = list(G.nodes())

        for i in range(len(nodelist) - 1):
            node1 = nodelist[i]
            for j in range(i + 1, len(nodelist)):
                node2 = nodelist[j]
                if 0:
                    print 'Checking the edge between node 1: ', node1, '  and node 2: ', node2
                # check if there should be an edge between node1 and node2
                # according to the desired connectivity rule
                if (connectivity_rule == 8):
                    if (abs(node1[0] - node2[0]) <=
                            1) and (abs(node1[1] - node2[1]) <= 1):
                        G.add_edge(node1, node2)
                        if 0:
                            print '8 connectivity Edge between node 1: ', node1, '  and node 2: ', node2
                if (connectivity_rule == 4):
                    if ((abs(node1[0] - node2[0]) + abs(node1[1] - node2[1]))
                            <= 1):
                        G.add_edge(node1, node2)
                        if 0:
                            print '4 connectivity Edge between node 1: ', node1, '  and node 2: ', node2

        # check the edges of G
        edgelist = list(G.edges())

        if 1:
            print 'No of nodes of G: ', G.number_of_nodes()
            print 'No of edges of G: ', G.number_of_edges()
            print 'Number of connected components of G: ', nx.number_connected_components(
                G)

        # scan through individual connected components
        # for each such connected component (list of interactions)
        # we find a representative interaction and print it in the final output file
        list_conn_comp = sorted(nx.connected_components(G),
                                key=len,
                                reverse=True)

        if 0:
            print '\n\n**** Number of connected components: ', len(
                list_conn_comp), '  ****\n\n'

        #====================
        # process individual connected components
        #====================
        for i in range(len(list_conn_comp)):
            # a connected component - a particular list of connected nodes
            curr_comp_list = list(list_conn_comp[i])
            if 0:
                print '\n\n\n  ===>>>>> Processing the connected component no: ', i, '  list: ', str(
                    curr_comp_list), '  number of elements: ', len(
                        curr_comp_list)

            # from the first interacting bin set, get the lower and higher bin index
            min_idx_bin1 = min([x[0] for x in curr_comp_list])
            max_idx_bin1 = max([x[0] for x in curr_comp_list])
            if 0:
                print 'min_idx_bin1: ', min_idx_bin1, ' max_idx_bin1: ', max_idx_bin1

            # get the span of coordinates for the first interacting bin (set)
            span_low_bin1 = (min_idx_bin1 - 1) * bin_size
            span_high_bin1 = max_idx_bin1 * bin_size
            if 0:
                print 'span_low_bin1: ', span_low_bin1, ' span_high_bin1: ', span_high_bin1

            # from the second interacting bin set, get the lower and higher bin index
            min_idx_bin2 = min([x[1] for x in curr_comp_list])
            max_idx_bin2 = max([x[1] for x in curr_comp_list])
            if 0:
                print 'min_idx_bin2: ', min_idx_bin2, ' max_idx_bin2: ', max_idx_bin2

            # get the span of coordinates for the first interacting bin (set)
            span_low_bin2 = (min_idx_bin2 - 1) * bin_size
            span_high_bin2 = max_idx_bin2 * bin_size
            if 0:
                print 'span_low_bin2: ', span_low_bin2, ' span_high_bin2: ', span_high_bin2

            # sum of contact counts for all the interacting bins
            # within this set of connected nodes
            sum_cc = sum([CurrChrDict[x]._GetCC() for x in curr_comp_list])

            # now get the percentage of bin pairs within this set of connected component
            # having a significant interaction
            total_possible_bin_pairs = (max_idx_bin1 - min_idx_bin1 +
                                        1) * (max_idx_bin2 - min_idx_bin2 + 1)
            possible_bin_pairs = 0
            for b1 in range(min_idx_bin1, (max_idx_bin1 + 1)):
                for b2 in range(min_idx_bin2, (max_idx_bin2 + 1)):
                    bin_pair_key = (b1, b2)
                    if bin_pair_key in CurrChrDict:
                        possible_bin_pairs = possible_bin_pairs + 1

            # % of bin pairs within the region spanned by this connected component
            # having significant interaction
            # the higher the %, the better this component is strongly connected
            Percent_Significant_BinPair = (possible_bin_pairs *
                                           1.0) / total_possible_bin_pairs

            if 0:
                print ' ==>>> total_possible_bin_pairs: ', total_possible_bin_pairs, ' possible_bin_pairs: ', possible_bin_pairs, ' % clique: ', Percent_Significant_BinPair

            #==================================================
            # approach 1 :
            # if TopPctElem = 0 then
            # get the bin having maximum statistical significance and corresponding bin pairs
            # ties are resolved by maximum contact count
            # if SortOrder = 0, maximum statistical significance === min P and Q values
            # if SortOrder = 1, maximum statistical significance === max P and Q (equivalent) measures
            #==================================================
            if (TopPctElem == 0):

                for j in range(len(curr_comp_list)):
                    curr_key = curr_comp_list[j]
                    curr_cc = CurrChrDict[curr_key]._GetCC()
                    curr_pval = CurrChrDict[curr_key]._GetPVal()
                    curr_qval = CurrChrDict[curr_key]._GetQVal()

                    curr_key_bin1_mid = (((curr_key[0] - 1) * bin_size) +
                                         (curr_key[0] * bin_size)) / 2
                    curr_key_bin2_mid = (((curr_key[1] - 1) * bin_size) +
                                         (curr_key[1] * bin_size)) / 2
                    if 0:
                        print ' Connected component index: ', j, ' curr_key: ', curr_key, ' bin 1 mid: ', curr_key_bin1_mid, ' bin 2 mid: ', curr_key_bin2_mid, ' CC: ', curr_cc, ' Pval: ', curr_pval, ' Qval: ', curr_qval
                    if (j == 0):
                        # first index
                        rep_bin_key = curr_key
                    elif (SortOrder == 0) and (
                            curr_pval < CurrChrDict[rep_bin_key]._GetPVal()
                    ) and (curr_qval < CurrChrDict[rep_bin_key]._GetQVal()):
                        # current element has higher statistical significance (lower P or Q value when SortOrder = 0)
                        rep_bin_key = curr_key
                    elif (SortOrder == 1) and (
                            curr_pval > CurrChrDict[rep_bin_key]._GetPVal()
                    ) and (curr_qval > CurrChrDict[rep_bin_key]._GetQVal()):
                        # current element has higher statistical significance (higher P or Q value when SortOrder = 1)
                        rep_bin_key = curr_key
                    elif (curr_pval
                          == CurrChrDict[rep_bin_key]._GetPVal()) and (
                              curr_qval == CurrChrDict[rep_bin_key]._GetQVal()
                          ) and (curr_cc > CurrChrDict[rep_bin_key]._GetCC()):
                        # current element has equal P and Q values
                        # but higher contact count
                        rep_bin_key = curr_key

                # fix the representative interaction
                rep_bin1_low = (rep_bin_key[0] - 1) * bin_size
                rep_bin1_high = rep_bin_key[0] * bin_size
                rep_bin2_low = (rep_bin_key[1] - 1) * bin_size
                rep_bin2_high = rep_bin_key[1] * bin_size
                cc = CurrChrDict[rep_bin_key]._GetCC()
                pval = CurrChrDict[rep_bin_key]._GetPVal()
                qval = CurrChrDict[rep_bin_key]._GetQVal()

                if 0:
                    print '**** Selected bin key: ', rep_bin_key, ' start bin mid: ', (
                        rep_bin1_low + rep_bin1_high) / 2, ' end bin mid: ', (
                            rep_bin2_low + rep_bin2_high
                        ) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval

                # write the interaction in the specified output file
                fp_outInt.write('\n' + str(curr_chr) + '\t' +
                                str(rep_bin1_low) + '\t' + str(rep_bin1_high) +
                                '\t' + str(curr_chr) + '\t' +
                                str(rep_bin2_low) + '\t' + str(rep_bin2_high) +
                                '\t' + str(cc) + '\t' + str(pval) + '\t' +
                                str(qval) + '\t' + str(span_low_bin1) + '\t' +
                                str(span_high_bin1) + '\t' +
                                str(span_low_bin2) + '\t' +
                                str(span_high_bin2) + '\t' + str(sum_cc) +
                                '\t' + str(Percent_Significant_BinPair))

            #==================================================
            # approach 2:
            # if TopPctElem > 0, and TopPctElem < 100
            # then get the top (TopPctElem %) elements from
            # each connected component, (P and Q values, contact counts)
            # use these elements if they satisfy the bin neighborhood threshold
            #==================================================
            if (TopPctElem > 0) and (TopPctElem < 100):

                # list to store the q-values and the CC values for individual bin pairs
                # for their sequential extraction
                Curr_Comp_Tuple_List = []

                # lists storing different attributes
                curr_conn_comp_CCList = []
                curr_conn_comp_QValList = []

                # process individual elements within this connected component
                for j in range(len(curr_comp_list)):
                    curr_key = curr_comp_list[j]
                    curr_cc = CurrChrDict[curr_key]._GetCC()
                    curr_pval = CurrChrDict[curr_key]._GetPVal()
                    curr_qval = CurrChrDict[curr_key]._GetQVal()
                    curr_conn_comp_CCList.append(curr_cc)
                    curr_conn_comp_QValList.append(curr_qval)

                    # create a min-heap structure
                    # first element: q-value
                    # if SortOrder = 0, lower: better: use the same sign when insering in the heap
                    # if SortOrder = 1, higher: better: reverse the sign when insering in the heap
                    # for ties, second element (contact count) - higher: better
                    # so, use negative signs
                    if (SortOrder == 0):
                        subl = [
                            curr_qval, ((-1) * curr_cc), curr_key[0],
                            curr_key[1]
                        ]
                    else:
                        subl = [((-1) * curr_qval), ((-1) * curr_cc),
                                curr_key[0], curr_key[1]]

                    # insert the element in the designated queue
                    heapq.heappush(Curr_Comp_Tuple_List, subl)

                # first get the maximum / minimum values from these lists
                max_cc = max(curr_conn_comp_CCList)
                min_qval = min(curr_conn_comp_QValList)

                # now obtain the values of top K % elements
                # from these lists
                # where K = 50 means it is median
                custom_cc = custom_percent(curr_conn_comp_CCList, TopPctElem,
                                           2)
                custom_qval = custom_percent(curr_conn_comp_QValList,
                                             TopPctElem, (SortOrder + 1))

                if 0:
                    print ' --> current connected component: max CC: ', max_cc, '  min Q val: ', min_qval, '  top K (TopPctElem): ', TopPctElem, '  custom_cc threshold: ', custom_cc, '  custom_qval threshold: ', custom_qval

                # this list stores the candidate interactions
                # from this particular connected component
                # that will be used in the final set of interactions
                Final_Rep_Key_List = []

                # now extract elements from the constructed queue
                while (len(Curr_Comp_Tuple_List) > 0):
                    curr_elem = heapq.heappop(Curr_Comp_Tuple_List)
                    if 0:
                        print 'extracted element from heap: ', curr_elem

                    #===================================
                    # earlier condition - 1 - sourya
                    # consider only those interactions
                    # which have sufficient values of both contact count
                    # and q-values

                    # # terminating condition - do not consider elements
                    # # with lower log 10 Q values than the custom_logqval
                    # if ((curr_elem[0] * (-1)) < custom_logqval):
                    #     break

                    # # coninue if the contact count falls below the designated threshold
                    # if ((curr_elem[1] * (-1)) < custom_cc):
                    #     continue
                    #===================================
                    # modified condition - sourya
                    # consider those interactions having
                    # significance value > K percentile
                    if ((SortOrder == 0) and (curr_elem[0] > custom_qval)) or (
                        (SortOrder == 1) and (curr_elem[0] < custom_qval)):
                        break
                    #===================================

                    # if this is the first element
                    # then insert they key in the candidate set of interactions
                    if (len(Final_Rep_Key_List) == 0):
                        subl = [curr_elem[2], curr_elem[3]]
                        Final_Rep_Key_List.append(subl)
                        if 0:
                            print '\t\t *** inserted element in the final list: ', str(
                                subl), '  generated Final_Rep_Key_List: ', str(
                                    Final_Rep_Key_List)
                        continue

                    # otherwise, check with the existing interactions
                    # and do not include if the bin falls within a certain
                    # neighborhood of earlier included interactions
                    # the neighborhood is already mentioned via command line parameters
                    flag = False
                    for i in range(len(Final_Rep_Key_List)):
                        # both ends of the bins should be within neighborhood thresholds
                        # of existing contacts
                        if (((abs(Final_Rep_Key_List[i][0] - curr_elem[2])) *
                             bin_size) <= NeighborHoodBinThr) and ((
                                 (abs(Final_Rep_Key_List[i][1] - curr_elem[3]))
                                 * bin_size) <= NeighborHoodBinThr):
                            flag = True
                            if 0:
                                print ' --- current element is within neighborhood of the bins indexed by ', i, '  of Final_Rep_Key_List'
                            break

                    if (flag == False):
                        # there is no such neighborhood constraints
                        # include the bin
                        subl = [curr_elem[2], curr_elem[3]]
                        Final_Rep_Key_List.append(subl)
                        if 0:
                            print '\t\t *** inserted element in the final list: ', str(
                                subl), '  generated Final_Rep_Key_List: ', str(
                                    Final_Rep_Key_List)

                # now print the candidate interactions
                # of the current component
                for i in range(len(Final_Rep_Key_List)):
                    rep_bin_key = (Final_Rep_Key_List[i][0],
                                   Final_Rep_Key_List[i][1])

                    # fix the representative interaction
                    rep_bin1_low = (rep_bin_key[0] - 1) * bin_size
                    rep_bin1_high = rep_bin_key[0] * bin_size
                    rep_bin2_low = (rep_bin_key[1] - 1) * bin_size
                    rep_bin2_high = rep_bin_key[1] * bin_size
                    cc = CurrChrDict[rep_bin_key]._GetCC()
                    pval = CurrChrDict[rep_bin_key]._GetPVal()
                    qval = CurrChrDict[rep_bin_key]._GetQVal()

                    if 1:
                        print '**** Selected bin key: ', rep_bin_key, ' start bin mid: ', (
                            rep_bin1_low + rep_bin1_high
                        ) / 2, ' end bin mid: ', (
                            rep_bin2_low + rep_bin2_high
                        ) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval

                    # write the interaction in the specified output file
                    fp_outInt.write('\n' + str(curr_chr) + '\t' +
                                    str(rep_bin1_low) + '\t' +
                                    str(rep_bin1_high) + '\t' + str(curr_chr) +
                                    '\t' + str(rep_bin2_low) + '\t' +
                                    str(rep_bin2_high) + '\t' + str(cc) +
                                    '\t' + str(pval) + '\t' + str(qval) +
                                    '\t' + str(span_low_bin1) + '\t' +
                                    str(span_high_bin1) + '\t' +
                                    str(span_low_bin2) + '\t' +
                                    str(span_high_bin2) + '\t' + str(sum_cc) +
                                    '\t' + str(Percent_Significant_BinPair))

            #==================================================
            # approach 3:
            # if TopPctElem = 100 (latest implementation)
            # then sequentially obtain the interactions with the lowest q-value
            # and break the tie for the higher contact count
            # use these elements if they satisfy the bin neighborhood threshold
            #==================================================
            if (TopPctElem == 100):

                # list to store the q-values and the CC values for individual bin pairs
                # for their sequential extraction
                Curr_Comp_Tuple_List = []

                # lists storing different attributes
                curr_conn_comp_CCList = []
                curr_conn_comp_QValList = []

                # process individual elements within this connected component
                for j in range(len(curr_comp_list)):
                    curr_key = curr_comp_list[j]
                    curr_cc = CurrChrDict[curr_key]._GetCC()
                    curr_qval = CurrChrDict[curr_key]._GetQVal()
                    curr_conn_comp_CCList.append(curr_cc)
                    curr_conn_comp_QValList.append(curr_qval)

                    # create a min-heap structure
                    # first element: q-value
                    # if SortOrder = 0, lower: better: use the same sign when insering in the heap
                    # if SortOrder = 1, higher: better: reverse the sign when insering in the heap
                    # for ties, second element (contact count) - higher: better
                    # so, use negative signs
                    if (SortOrder == 0):
                        subl = [
                            curr_qval, ((-1) * curr_cc), curr_key[0],
                            curr_key[1]
                        ]
                    else:
                        subl = [((-1) * curr_qval), ((-1) * curr_cc),
                                curr_key[0], curr_key[1]]

                    # insert the element in the designated queue (min-heap property)
                    heapq.heappush(Curr_Comp_Tuple_List, subl)

                # this list stores the candidate interactions
                # from this particular connected component
                # that will be used in the final set of interactions
                Final_Rep_Key_List = []

                if 0:
                    print ' **** Processing the connected component ===== number of elements: ', len(
                        Curr_Comp_Tuple_List)

                # now extract elements from the constructed queue
                while (len(Curr_Comp_Tuple_List) > 0):
                    # extract the first element from the min-heap
                    # element with the lowest value
                    curr_elem = heapq.heappop(Curr_Comp_Tuple_List)
                    if 0:
                        print 'extracted element from heap: ', curr_elem

                    # if this is the first element
                    # then insert they key in the candidate set of interactions
                    if (len(Final_Rep_Key_List) == 0):
                        subl = [curr_elem[2], curr_elem[3]]
                        Final_Rep_Key_List.append(subl)
                        if 0:
                            print '*** inserted element in the final list: ', str(
                                subl)
                        continue

                    # otherwise, check with the existing interactions
                    # and do not include if the bin falls within a certain
                    # neighborhood of earlier included interactions
                    # the neighborhood is already mentioned via command line parameters
                    flag = False
                    for i in range(len(Final_Rep_Key_List)):
                        # both ends of the bins should be within neighborhood thresholds
                        # of existing contacts
                        if (((abs(Final_Rep_Key_List[i][0] - curr_elem[2])) *
                             bin_size) <= NeighborHoodBinThr) and ((
                                 (abs(Final_Rep_Key_List[i][1] - curr_elem[3]))
                                 * bin_size) <= NeighborHoodBinThr):
                            flag = True
                            if 0:
                                print ' --- current element is within neighborhood of the existing (included) bin ', Final_Rep_Key_List[
                                    i]
                            break

                    if (flag == False):
                        # there is no such neighborhood constraints
                        # include the bin
                        subl = [curr_elem[2], curr_elem[3]]
                        Final_Rep_Key_List.append(subl)
                        if 0:
                            print '*** inserted element in the final list: ', str(
                                subl)

                # now print the candidate interactions
                # of the current component
                if 0:
                    print '\n\n**** Printing selected loops of the connected component ***\n\n'

                for i in range(len(Final_Rep_Key_List)):
                    rep_bin_key = (Final_Rep_Key_List[i][0],
                                   Final_Rep_Key_List[i][1])

                    # fix the representative interaction
                    rep_bin1_low = (rep_bin_key[0] - 1) * bin_size
                    rep_bin1_high = rep_bin_key[0] * bin_size
                    rep_bin2_low = (rep_bin_key[1] - 1) * bin_size
                    rep_bin2_high = rep_bin_key[1] * bin_size
                    cc = CurrChrDict[rep_bin_key]._GetCC()
                    pval = CurrChrDict[rep_bin_key]._GetPVal()
                    qval = CurrChrDict[rep_bin_key]._GetQVal()
                    if 0:
                        print 'Selected bin key: ', rep_bin_key, ' start bin mid: ', (
                            rep_bin1_low + rep_bin1_high
                        ) / 2, ' end bin mid: ', (
                            rep_bin2_low + rep_bin2_high
                        ) / 2, ' cc: ', cc, ' pval: ', pval, ' qval: ', qval

                    # write the interaction in the specified output file
                    fp_outInt.write('\n' + str(curr_chr) + '\t' +
                                    str(rep_bin1_low) + '\t' +
                                    str(rep_bin1_high) + '\t' + str(curr_chr) +
                                    '\t' + str(rep_bin2_low) + '\t' +
                                    str(rep_bin2_high) + '\t' + str(cc) +
                                    '\t' + str(pval) + '\t' + str(qval) +
                                    '\t' + str(span_low_bin1) + '\t' +
                                    str(span_high_bin1) + '\t' +
                                    str(span_low_bin2) + '\t' +
                                    str(span_high_bin2) + '\t' + str(sum_cc) +
                                    '\t' + str(Percent_Significant_BinPair))

            #==================================================

    # after processing all the chromosomes, now close the output interaction file
    fp_outInt.close()

    # # remove the temp log file which stores the max coordinate for a chromosome
    # sys_cmd = "rm " + str(temp_log_file)
    # os.system(sys_cmd)

    # remove the temporary chromosome specific interaction dump file
    sys_cmd = "rm " + str(tempchrdumpfile)
    os.system(sys_cmd)

    if 1:
        print '==================== End of merge filtering adjacent interactions !!! ======================'
 def get(self):
     """Pop the minimum entry of ``self.heap`` and hand it back.

     The popped entry is also deleted from the ``self.set`` mapping, so
     the two containers stay in step.
     """
     item = heapq.heappop(self.heap)
     members = self.set
     del members[item]
     return item
Exemple #10
0
    def aStarSearch(self, distance):
        """Best-first search from this node using cost so far plus a heuristic.

        Starting at ``self``, the node with the smallest priority is popped
        from a heap, ``move()`` is called on it, and each of its ``left``,
        ``right``, ``up`` and ``down`` neighbours whose ``state`` has not
        been popped before gets its cost history updated and is pushed with
        priority ``cost_history + heuristics(distance, end_position)``.

        :param distance: forwarded unchanged as the first argument of every
            ``heuristics`` call.
        :return: the first popped node whose ``checkEnd()`` is true, or
            ``None`` when the frontier runs empty or an ordinary exception
            is raised during the search.
        """
        if self is None:
            return None
        try:
            path_queue = []
            # States of every node popped so far.  Kept as a list because
            # nothing here shows that ``state`` is hashable.
            visited_list = []
            # ``count / base`` is added to every priority; presumably this
            # keeps priorities distinct so the heap never has to compare
            # two node objects -- TODO confirm.
            count = float(0)
            base = float(10000000000)
            heapq.heappush(path_queue, (1, self))
            _, node = heapq.heappop(path_queue)
            visited_list.append(node.state)

            while not node.checkEnd():
                node.move()
                # Same expansion order as always: left, right, up, down.
                for direction in ('left', 'right', 'up', 'down'):
                    child = getattr(node, direction)
                    if child is None or child.state in visited_list:
                        continue
                    count += 1
                    child.updateCosthistory(
                        node.cost_history + node.pathcost(child))
                    # NOTE(review): end_position is not defined in this
                    # method; it has to be provided by the enclosing module.
                    priority = (child.cost_history +
                                child.heuristics(distance, end_position) +
                                count / base)
                    heapq.heappush(path_queue, (priority, child))

                if not path_queue:
                    # Frontier exhausted without reaching an end node.
                    return None
                _, node = heapq.heappop(path_queue)
                visited_list.append(node.state)
            return node
        except Exception:
            # Best-effort contract kept from the original: any ordinary
            # failure yields None.  KeyboardInterrupt / SystemExit are no
            # longer swallowed.
            return None
Exemple #11
0
    def uniformCostSearch(self):
        """Cost-ordered search from this node over its four neighbours.

        Starting at ``self``, the node with the smallest priority is popped
        from a heap, ``move()`` is called on it, and each of its ``left``,
        ``right``, ``up`` and ``down`` neighbours whose ``state`` has not
        been popped before is pushed with priority ``node.pathcost(child)``.

        :return: the first popped node whose ``checkEnd()`` is true, or
            ``None`` when the frontier runs empty or an ordinary exception
            is raised during the search.
        """
        if self is None:
            return None
        try:
            path_queue = []
            # States of every node popped so far.  Kept as a list because
            # nothing here shows that ``state`` is hashable.
            visited_list = []
            # ``count / base`` is added to every priority; presumably this
            # keeps priorities distinct so the heap never has to compare
            # two node objects -- TODO confirm.
            count = float(0)
            base = float(10000000000)
            heapq.heappush(path_queue, (1, self))
            _, node = heapq.heappop(path_queue)
            visited_list.append(node.state)

            while not node.checkEnd():
                node.move()
                # Same expansion order as always: left, right, up, down.
                for direction in ('left', 'right', 'up', 'down'):
                    child = getattr(node, direction)
                    if child is None or child.state in visited_list:
                        continue
                    count += 1
                    # NOTE(review): the priority is only the cost returned
                    # by pathcost() for this one step; nothing cumulative is
                    # added here -- confirm that is what is wanted.
                    priority = node.pathcost(child) + count / base
                    heapq.heappush(path_queue, (priority, child))

                if not path_queue:
                    # Frontier exhausted without reaching an end node.
                    return None
                _, node = heapq.heappop(path_queue)
                visited_list.append(node.state)
            return node
        except Exception:
            # Best-effort contract kept from the original: any ordinary
            # failure yields None.  KeyboardInterrupt / SystemExit are no
            # longer swallowed.
            return None
Exemple #12
0
 def get(self):
     """Pop the minimum entry of ``self.heap`` and hand it back.

     The popped entry is also deleted from the ``self.set`` mapping, so
     the two containers stay in step.
     """
     item = heapq.heappop(self.heap)
     members = self.set
     del members[item]
     return item
Exemple #13
0
 def findKthLargest(self, nums, k):
     """
     Return the k-th largest element of nums (k == 1 is the maximum).

     nums is left untouched; duplicates count as separate elements.

     :type nums: List[int]
     :type k: int
     :rtype: int
     :raises IndexError: if k is not between 1 and len(nums)
     """
     # Imported directly: reaching heapq through the Queue module only works
     # where that module happens to import it.
     import heapq

     if not 1 <= k <= len(nums):
         raise IndexError('k must be between 1 and len(nums)')
     # nlargest keeps only k candidates and does not modify its input, unlike
     # heapify + popping everything, which empties the caller's list.
     return heapq.nlargest(k, nums)[-1]
Exemple #14
0
 def search(self, query, return_length=100, passage_len=50, return_urls_only=False):
     '''
         Performs search on loaded data.

         The query is split on single spaces; every word found in
         self.dictionary has its posting line read from self.index and
         decoded, and the per-word candidate lists are combined with join().
         Candidates are first cut down to return_length entries by a
         BM25-style score and then re-scored by the best-scoring window of
         query-word hits spanning at most passage_len positions.

         Returns a list sorted by descending window score of:
           * tuples (url_id, url, score) if return_urls_only == False
           * tuples (url_id, url)        if return_urls_only == True
         An empty list is returned when the query has no words or no
         candidate is found.
     '''
     words = [w for w in query.strip().split(" ") if w != '']
     if len(words) == 0:
         return []

     result = None
     # word_index[z] stays None for query words missing from the dictionary.
     word_index = [None] * len(words)
     for z, word in enumerate(words):
         word = self.norm(word.decode('utf-8').strip())
         if word in self.dictionary:
             self.index.seek(self.dictionary[word], 0)
             compressed = self.index.readline().strip()
             decompressed = None
             if self.encoding == VARBYTE:
                 decompressed = decode_varbyte(base64.b64decode(compressed))
             elif self.encoding == SIMPLE9:
                 decompressed = decode_simple9(base64.b64decode(compressed))
             decompressed, word_index[z] = from_flat(decompressed)
             if result is None:
                 result = decompressed
             else:
                 result = join(result, decompressed)

     k1 = 2
     b = 0.75
     if result is None or len(result) == 0:
         return []

     # Now we have a list of candidates. We apply a BM25-style score to
     # leave only return_length of them.
     # The average length is taken from the first query word that was found;
     # a non-empty result guarantees there is one.
     j = 0
     while word_index[j] is None:
         j += 1
     first_found = word_index[j]
     avg_len = 0.
     for doc in result:
         if doc in first_found:
             avg_len += first_found[doc][0]
     avg_len /= len(result)

     BM25 = [0] * len(result)
     for postings in word_index:
         if postings is None:
             continue
         idf = log(float(self.N) / len(postings))
         for i, doc in enumerate(result):
             if doc in postings:
                 # presumably entry[0] is the document length and entry[1]
                 # the list of hit positions -- verify against from_flat()
                 entry = postings[doc]
                 tf = float(len(entry[1])) / entry[0]
                 # NOTE(review): b and (1 - b) weight the length ratio the
                 # other way round from the textbook BM25 formula -- confirm
                 # this is intended.
                 BM25[i] += tf * idf / (tf + k1 * (b + entry[0] / avg_len * (1 - b)))

     if len(result) > return_length:
         ranked = list(zip(BM25, result))
         heap = ranked[:return_length]
         heapq.heapify(heap)
         for rank, ind in ranked[return_length:]:
             # heap[0] is the weakest candidate kept so far; it is replaced
             # only by a strictly better rank.
             if heap[0][0] < rank:
                 heapq.heapreplace(heap, (rank, ind))
         result = [ind for rank, ind in heap]

     # Now we have a shortened list of candidates. Each one is scored by its
     # best window of query-word hits.
     scores = [0] * len(result)
     for i, doc in enumerate(result):
         # (position, query word number) for every hit in this candidate
         passage = []
         for j, postings in enumerate(word_index):
             if postings is not None and doc in postings:
                 passage.extend([(pos, j) for pos in postings[doc][1]])
         passage.sort()
         features = [0] * 5
         for lo in xrange(len(passage)):
             for hi in xrange(lo, len(passage)):
                 span = passage[hi][0] - passage[lo][0] + 1
                 if span > passage_len:
                     # positions are sorted, so every later hi is wider still
                     break
                 passage_w = [hit[1] for hit in passage[lo:hi + 1]]
                 # share of the query words present in the window
                 features[0] = len(set(passage_w)) / float(len(words))
                 # how early the window starts, relative to entry[0]
                 features[1] = 1 - float(passage[lo][0]) / word_index[passage[lo][1]][doc][0]
                 # one minus the ratio of hits to window width
                 features[2] = 1 - float(hi - lo + 1) / span

                 # tf * idf summed over the query words, idf scaled by log(N)
                 features[3] = 0
                 for j, postings in enumerate(word_index):
                     if postings is not None:
                         idf = log(float(self.N) / len(postings)) / log(self.N)
                         tf = float(passage_w.count(j)) / span
                         features[3] += tf * idf

                 # fraction of hit pairs whose query words are out of order
                 features[4] = 0
                 for j in xrange(len(passage_w) - 1):
                     for k in xrange(j + 1, len(passage_w)):
                         if passage_w[j] > passage_w[k]:
                             features[4] += 1
                 if len(passage_w) != 1:
                     features[4] /= float(len(passage_w) * (len(passage_w) - 1) / 2)

                 score = (features[0] + features[1] + features[2] +
                          features[3] + features[4])
                 if score > scores[i]:
                     scores[i] = score

     ordered = sorted(zip(scores, result), reverse=True)
     if return_urls_only:
         return [(url_id, self.urls[url_id]) for score, url_id in ordered]
     return [(url_id, self.urls[url_id], score) for score, url_id in ordered]