Code example #1
File: dtree.py Project: AsifulNobel/CSE_440
def information_gain(examples, attribute, threshold, class_labels):
    # variable name meaning
    # exc -> total number of examples
    # excc -> total number of columns in first example
    # ex_in_lt -> example indices less than threshold
    # ex_in_ge -> example indices greater than or equal to threshold
    # exc_lt -> total number of elements in ex_in_lt
    # exc_ge -> total number of elements in ex_in_ge
    # cl_freq_node -> node's class label frequency
    # cl_freq_l_node -> class label frequency of examples less than threshold
    # cl_freq_r_node -> class label frequency of examples greater than or equal
    # ----------------> to threshold

    exc = len(examples) * 1.0
    excc = len(examples[0])
    ex_in_lt = [index for index, row in enumerate(examples) if row[attribute] < threshold]
    ex_in_ge = [index for index, row in enumerate(examples) if row[attribute] >= threshold]
    exc_lt = len(ex_in_lt) * 1.0
    exc_ge = len(ex_in_ge) * 1.0

    gain = 0

    cl_freq_node = {}
    cl_freq_l_node = {}
    cl_freq_r_node = {}

    for label in class_labels:
        cl_freq_node[label] = 0
        cl_freq_l_node[label] = 0
        cl_freq_r_node[label] = 0

    for index, row in enumerate(examples):
        cl_freq_node[int(row[excc-1])] += 1

        if index in ex_in_lt:
            cl_freq_l_node[int(row[excc-1])] += 1

        if index in ex_in_ge:
            cl_freq_r_node[int(row[excc-1])] += 1

    for key, val in cl_freq_node.iteritems():
        if val > 0:
            gain += -((val/exc)*logarithm((val/exc), 2))

    if exc_lt > 0:
        for key, val in cl_freq_l_node.iteritems():
            if val > 0:
                gain -= (exc_lt/exc) * -((val/exc_lt)*logarithm((val/exc_lt), 2))

    if exc_ge > 0:
        for key, val in cl_freq_r_node.iteritems():
            if val > 0:
                gain -= (exc_ge/exc) * -((val/exc_ge)*logarithm((val/exc_ge), 2))

    return gain
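
Below is a minimal usage sketch for the function above. It assumes `logarithm` is bound to `math.log` (as the two-argument calls suggest) and that each example row carries its class label in the last column, which is what `row[excc-1]` indexes; the data and the call are hypothetical.

from math import log as logarithm   # assumed binding for this page's examples

# Hypothetical toy data: two feature columns plus the class label in the last column.
examples = [
    [2.0, 7.0, 0],
    [3.0, 8.0, 0],
    [6.0, 1.0, 1],
    [7.0, 2.0, 1],
]

gain = information_gain(examples, attribute=0, threshold=5.0, class_labels=[0, 1])
print(gain)   # 1.0: splitting attribute 0 at 5.0 separates the two classes exactly
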
Code example #2
def compute_my_variability(event_log: Log) -> float:
    """
    Computes the prefix entropy of the input Log
    Args:
        event_log (Log): the input log

    Returns:
        the prefix-block entropy
    """
    prefixes: List[List[Event]] = []
    bar: Bar = IncrementalBar("Prefix generation",
                              max=len(event_log.trace_list))
    for trace in event_log.trace_list:
        trace_prefixes: List[List[Event]] = trace.get_all_prefixes()

        for prefix in trace_prefixes:
            if prefix not in prefixes:
                prefixes.append(prefix)
        bar.next()
    bar.finish()

    entropy: float = 0

    bar = ShadyBar("Prefix likelihood estimation", max=len(prefixes))
    for prefix in prefixes:

        p: float = _prefix_likelihood_estimator(event_log, prefix)
        entropy += p * logarithm(p, 10)

        bar.next()
    bar.finish()

    entropy *= -1

    return entropy
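
As a rough illustration of the quantity computed above, here is a self-contained sketch on plain Python lists. It assumes, as a guess at what `_prefix_likelihood_estimator` does, that a prefix's likelihood is its relative frequency among all trace prefixes in the log; the `Log`/`Event` types and the progress bars are left out.

from math import log as logarithm   # assumed binding for this page's examples

def prefix_entropy(traces):
    # Each trace is a list of activity labels; collect every non-empty prefix.
    all_prefixes = [tuple(trace[:i]) for trace in traces for i in range(1, len(trace) + 1)]

    entropy = 0.0
    for prefix in set(all_prefixes):
        # Hypothetical estimator: relative frequency of this prefix among all prefixes.
        p = all_prefixes.count(prefix) / len(all_prefixes)
        entropy += p * logarithm(p, 10)
    return -entropy

print(prefix_entropy([["a", "b", "c"], ["a", "b", "d"]]))   # roughly 0.58
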
Code example #3
    def add_result(self, cfg, model, result):
        result.model = model
        result.params = cfg.processor.models.get_num_params(model)

        K = float(result.params)
        n = float(len(self.column_set))
        lnL = float(result.lnl)

        # Here we put in a catch for small subsets, where n < K+2.
        # If this happens, the AICc actually starts rewarding very small
        # datasets, which is wrong. A simple but crude catch for this is just
        # to never allow n to go below K+2.
        result.aic = (-2.0 * lnL) + (2.0 * K)
        result.bic = (-2.0 * lnL) + (K * logarithm(n))

        if n < (K + 2):
            log.warning(self.SMALL_WARNING % (self, n, model, K, self.name))
            n = K + 2

        result.aicc = (-2.0 * lnL) + ((2.0 * K) * (n / (n - K - 1.0)))

        # This is the rate per site of the model - used in some clustering
        # analyses
        result.site_rate = float(result.tree_size)

        log.debug("Adding model to subset. Model: %s, params %d, site_rate %f"
                  % (model, K, result.site_rate))

        if model in self.results:
            log.error("Can't add model result %s, it already exists in %s",
                      model, self)
        self.results[model] = result
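
The three information criteria used above can be collected into a small standalone helper. This is only a sketch of the formulas as they appear in these examples, assuming the single-argument `logarithm(n)` is the natural log; note that the add_result and scheme examples on this page differ in whether n is clamped to K + 2 before or after the BIC line, which changes the BIC (but not the AIC) for very small subsets.

from math import log as logarithm   # natural log assumed

def information_criteria(lnL, K, n):
    # Crude small-sample catch from the examples above: never let n drop below K + 2.
    n = max(n, K + 2.0)
    aic = (-2.0 * lnL) + (2.0 * K)
    bic = (-2.0 * lnL) + (K * logarithm(n))
    aicc = (-2.0 * lnL) + ((2.0 * K) * (n / (n - K - 1.0)))
    return aic, bic, aicc
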
Code example #4
    def add_result(self, cfg, model, result):
        result.model = model
        result.params = cfg.processor.models.get_num_params(model)

        K = float(result.params)
        n = float(len(self.columnset))
        lnL = float(result.lnl)
        #here we put in a catch for small subsets, where n<K+2
        #if this happens, the AICc actually starts rewarding very small datasets, which is wrong
        #a simple but crude catch for this is just to never allow n to go below k+2
        if n < (K + 2):
            log.warning("The subset containing the following data_blocks: %s, has a very small"
                        " number of sites (%d) compared to the number of parameters"
                        " in the model being estimated (the %s model which has %d parameters)."
                        " This may give misleading AICc results, so please check carefully"
                        " if you are using the AICc for your analyses."
                        " The model selection results for this subset are in the following file:"
                        " /analysis/subsets/%s.txt\n" % (self, n, model, K, self.name))
            n = K + 2

        result.aic = (-2.0 * lnL) + (2.0 * K)
        result.bic = (-2.0 * lnL) + (K * logarithm(n))
        result.aicc = (-2.0 * lnL) + ((2.0 * K) * (n / (n - K - 1.0)))

        #this is the rate per site of the model - used in some clustering analyses
        result.site_rate = float(result.tree_size)

        log.debug("Adding model to subset. Model: %s, params %d, site_rate %f" % (model, K, result.site_rate))

        if model in self.results:
            log.error("Can't add model result %s, it already exists in %s",
                      model, self)
        self.results[model] = result
Code example #5
File: phyml.py Project: wrightaprilm/partitionfinder
def get_CIs(cfg):
    # Read one value per line from rates.txt and return the log of each,
    # wrapped in a single-element list.
    ci_list = []
    fname = os.path.join(cfg.base_path, 'rates.txt')
    with open(fname) as the_cis:
        for ci in the_cis.readlines():
            ci_list.append([logarithm(float(ci))])
    return ci_list
Code example #6
    def __init__(self, sch, nseq, branchlengths, model_selection):
        self.scheme_name = sch.name
        self.scheme = sch
        self.model_selection = model_selection

        # Calculate AIC, BIC, AICc for each scheme.
        # How you do this depends on whether brlens are linked or not.
        self.nsubs = len(sch.subsets)  # number of subsets
        sum_subset_k = sum([s.best_params for s in sch])  # sum of number of parameters in the best model of each subset

        log.debug("Calculating number of parameters in scheme:")
        log.debug("Total parameters from subset models: %d" % (sum_subset_k))

        if branchlengths == 'linked':  # linked brlens - only one extra parameter per subset
            self.sum_k = sum_subset_k + (self.nsubs - 1) + (
                (2 * nseq) - 3)  # number of parameters in a scheme
            log.debug("Total parameters from brlens: %d" % ((2 * nseq) - 3))
            log.debug(
                "Parameters from subset multipliers: %d" % (self.nsubs - 1))

        elif branchlengths == 'unlinked':  # unlinked brlens - every subset has its own set of brlens
            self.sum_k = sum_subset_k + (self.nsubs * (
                (2 * nseq) - 3))  # number of parameters in a scheme
            log.debug("Total parameters from brlens: %d" % ((
                2 * nseq) - 3) * self.nsubs)

        else:
            # WTF?
            log.error("Unknown option for branchlengths: %s", branchlengths)
            raise AnalysisError

        log.debug("Grand total parameters: %d" % (self.sum_k))

        self.lnl = sum([s.best_lnl for s in sch])
        self.nsites = sum([len(s.columnset) for s in sch])

        K = float(self.sum_k)
        n = float(self.nsites)
        lnL = float(self.lnl)

        log.debug("n: %d\tK: %d" % (n, K))

        #here we put in a catch for small subsets, where n<K+2
        #if this happens, the AICc actually starts rewarding very small datasets, which is wrong
        #a simple but crude catch for this is just to never allow n to go below k+2
        if n < (K + 2):
            log.warning("Scheme '%s' has a very small"
                        " number of sites (%d) compared to the number of parameters"
                        " in the models that make up the subsets"
                        " This may give misleading AICc results, so please check carefully"
                        " if you are using the AICc for your analyses."
                        " The results for this scheme are in the following file:"
                        " /analysis/schemes/%s.txt\n" % (sch.name, n, sch.name))
            n = K + 2

        self.aic = (-2.0 * lnL) + (2.0 * K)
        self.bic = (-2.0 * lnL) + (K * logarithm(n))
        self.aicc = (-2.0 * lnL) + ((2.0 * K) * (n / (n - K - 1.0)))
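
A quick worked example of the parameter counting above, using hypothetical numbers:

# Hypothetical counts: 10 sequences, 3 subsets, 20 parameters across the best subset models.
nseq, nsubs, sum_subset_k = 10, 3, 20
brlens = (2 * nseq) - 3                          # 17 branch lengths in an unrooted tree
linked_k = sum_subset_k + (nsubs - 1) + brlens   # 20 + 2 + 17 = 39 parameters
unlinked_k = sum_subset_k + nsubs * brlens       # 20 + 3 * 17 = 71 parameters
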
Code example #7
    def __init__(self, sch, nseq, branchlengths, model_selection):
        self.scheme_name = sch.name
        self.scheme = sch
        self.model_selection = model_selection

        # Calculate AIC, BIC, AICc for each scheme.
        # How you do this depends on whether brlens are linked or not.
        self.nsubs = len(sch.subsets)  # number of subsets
        sum_subset_k = sum([s.best_params for s in sch])  # sum of number of parameters in the best model of each subset

        log.debug("Calculating number of parameters in scheme:")
        log.debug("Total parameters from subset models: %d" % (sum_subset_k))

        if branchlengths == 'linked':  # linked brlens - only one extra parameter per subset
            self.sum_k = sum_subset_k + (self.nsubs - 1) + (
                (2 * nseq) - 3)  # number of parameters in a scheme
            log.debug("Total parameters from brlens: %d" % ((2 * nseq) - 3))
            log.debug(
                "Parameters from subset multipliers: %d" % (self.nsubs - 1))

        elif branchlengths == 'unlinked':  # unlinked brlens - every subset has its own set of brlens
            self.sum_k = sum_subset_k + (self.nsubs * (
                (2 * nseq) - 3))  # number of parameters in a scheme
            log.debug("Total parameters from brlens: %d" % ((
                2 * nseq) - 3) * self.nsubs)

        else:
            # WTF?
            log.error("Unknown option for branchlengths: %s", branchlengths)
            raise AnalysisError

        log.debug("Grand total parameters: %d" % (self.sum_k))

        self.lnl = sum([s.best_lnl for s in sch])
        self.nsites = sum([len(s.columnset) for s in sch])

        K = float(self.sum_k)
        n = float(self.nsites)
        lnL = float(self.lnl)

        log.debug("n: %d\tK: %d" % (n, K))

        #here we put in a catch for small subsets, where n<K+2
        #if this happens, the AICc actually starts rewarding very small datasets, which is wrong
        #a simple but crude catch for this is just to never allow n to go below k+2
        self.aic = (-2.0 * lnL) + (2.0 * K)
        self.bic = (-2.0 * lnL) + (K * logarithm(n))

        if n < (K + 2):
            log.warning("Scheme '%s' has a very small"
                        " number of sites (%d) compared to the number of parameters"
                        " in the models that make up the subsets"
                        " This may give misleading AICc results, so please check carefully"
                        " if you are using the AICc for your analyses." % (sch.name, n,))
            n = K + 2

        self.aicc = (-2.0 * lnL) + ((2.0 * K) * (n / (n - K - 1.0)))
Code example #8
File: providers.py Project: gaebor/waterfall
def convert_size(size_bytes, table, base):
    # table holds the unit suffixes for successive powers of base (e.g. B, KB, MB, ...).
    if len(table) == 0 or base <= 0:
        raise ValueError("ERROR in convert_size")
    if size_bytes == 0:
        return "0" + table[0]
    # Index of the largest unit not exceeding size_bytes, capped at the last table entry.
    i = min(int(logarithm(size_bytes, base)), len(table) - 1)
    p = base ** i
    s = round(size_bytes / p, 2)
    return "{:g}{:s}".format(s, table[i])
Code example #9
File: test_information.py Project: brettc/bricolage
def _mutual_info(joint):
    assert numpy.isclose(joint.sum(), 1.0)

    info = 0.0
    rownum, colnum = joint.shape
    colsum = joint.sum(axis=0)
    rowsum = joint.sum(axis=1)
    for row in range(rownum):
        for col in range(colnum):
            p_xy = joint[row, col]
            p_x = rowsum[row]
            p_y = colsum[col]
            if p_xy != 0:
                info += p_xy * logarithm(p_xy / (p_x * p_y), 2)
    return info
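
For instance, a perfectly correlated two-by-two joint distribution yields one bit of mutual information (assuming `logarithm` is a base-aware log such as `math.log`):

import numpy

joint = numpy.array([[0.5, 0.0],
                     [0.0, 0.5]])
print(_mutual_info(joint))   # 1.0 bit: knowing the row determines the column
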
Code example #10
async def generate_random_url_alias(_urandom=urandom,
                                    _encode=b32encode,
                                    _randint=randint,
                                    _factor=logarithm(256, 32)):
    # The length of URL alias is 8-12 characters
    count = _randint(8, 12)

    # count + 1 / _factor gives us the number of bytes needed
    # to produce *at least* count encoded characters
    random_str = (_encode(_urandom(int(
        (count + 1) / _factor)))[:count].decode("ascii").lower())

    while await Url.query.where(Url.url_alias == random_str).gino.first():
        random_str = (_encode(_urandom(int(
            (count + 1) / _factor)))[:count].decode("ascii").lower())
    return random_str
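
A small check of the byte-count comment above: `_factor = logarithm(256, 32)` is 8/5 (one byte holds 8 bits, one Base32 character encodes 5), so `int((count + 1) / _factor)` bytes always Base32-encode to at least `count` characters over the 8-12 range used here. This is a standalone verification sketch, assuming `logarithm` is `math.log`.

from base64 import b32encode
from math import log as logarithm
from os import urandom

factor = logarithm(256, 32)            # 1.6: bits per byte / bits per Base32 character
for count in range(8, 13):
    nbytes = int((count + 1) / factor)
    encoded = b32encode(urandom(nbytes)).decode("ascii").rstrip("=")
    assert len(encoded) >= count, (count, nbytes, len(encoded))
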
Code example #11
File: C45.py Project: florian/decision-trees
    def _entropy(self, X):
        """
        Calculates the Shannon entropy on the given data X

        Arguments:
            X: An iterable for feature values. Usually, this is now a 1D list
        """

        summed = 0
        counter = Counter(X)

        for value in counter:
            count = counter[value]
            px = count / float(len(X))
            summed += px * logarithm(1. / px, 2)

        return summed
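
Called on a 50/50 feature column the method returns 1.0 bit, since it reduces to the usual -sum(p * log2(p)). A hedged standalone equivalent (the class instance plays no role in the computation, and `logarithm` is assumed to be `math.log`):

from collections import Counter
from math import log as logarithm

def entropy(X):
    counter = Counter(X)
    return sum((count / float(len(X))) * logarithm(float(len(X)) / count, 2)
               for count in counter.values())

print(entropy(["yes", "yes", "no", "no"]))   # 1.0
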
Code example #12
File: providers.py Project: gaebor/waterfall
    def gpu():
        nvmlInit()
        num_devices = nvmlDeviceGetCount()
        if num_devices > 0:
            padding = int(logarithm(num_devices, 10)) + 1
        for i in range(num_devices):
            handle = nvmlDeviceGetHandleByIndex(i)
            name = nvmlDeviceGetName(handle).decode("ascii")
            total_memory = nvmlDeviceGetMemoryInfo(handle).total
            utilization = nvmlDeviceGetUtilizationRates(handle)
            gpu_percent, memory_percent = utilization.gpu, utilization.memory

            yield (f'gpu{i:0{padding}d}', gpu_percent, 0, f'{name} ({i})')
            yield (
                f'gpu{i:0{padding}d} memory',
                memory_percent,
                0,
                f'{convert_size_2(memory_percent * total_memory / 100)}B / {convert_size_2(total_memory)}B',
            )
        nvmlShutdown()
Code example #13
File: subset.py Project: unagichin/partitionfinder
    def add_result(self, cfg, model, result):
        result.model = model
        result.params = cfg.processor.models.get_num_params(model)

        K = float(result.params)
        n = float(len(self.columnset))
        lnL = float(result.lnl)
        #here we put in a catch for small subsets, where n<K+2
        #if this happens, the AICc actually starts rewarding very small datasets, which is wrong
        #a simple but crude catch for this is just to never allow n to go below k+2
        result.aic = (-2.0 * lnL) + (2.0 * K)
        result.bic = (-2.0 * lnL) + (K * logarithm(n))

        if n < (K + 2):
            log.warning(
                "The subset containing the following data_blocks: %s, has a very small"
                " number of sites (%d) compared to the number of parameters"
                " in the model being estimated (the %s model which has %d parameters)."
                " This may give misleading AICc results, so please check carefully"
                " if you are using the AICc for your analyses."
                " The model selection results for this subset are in the following file:"
                " /analysis/subsets/%s.txt\n" % (self, n, model, K, self.name))
            n = K + 2

        result.aicc = (-2.0 * lnL) + ((2.0 * K) * (n / (n - K - 1.0)))

        #this is the rate per site of the model - used in some clustering analyses
        result.site_rate = float(result.tree_size)

        log.debug(
            "Adding model to subset. Model: %s, params %d, site_rate %f" %
            (model, K, result.site_rate))

        if model in self.results:
            log.error("Can't add model result %s, it already exists in %s",
                      model, self)
        self.results[model] = result
Code example #14
File: providers.py Project: gaebor/waterfall
def cpu():
    cpu_times = psutil.cpu_times_percent(percpu=True)
    frequencies = psutil.cpu_freq(percpu=True)

    padding = int(logarithm(len(cpu_times), 10)) + 1

    cpu_infos = [
        (f'cpu{i:0{padding}d}', cpu_time.user, cpu_time.system)
        for i, cpu_time in enumerate(cpu_times)
    ]
    cpu_infos.append(
        (
            'cpu',
            sum(x[1] for x in cpu_infos) / len(cpu_infos),
            sum(x[2] for x in cpu_infos) / len(cpu_infos),
        )
    )

    if len(frequencies) == 1:
        cpu_infos[-1] += (str(frequencies[0].current) + 'MHz',)
    elif len(frequencies) == len(cpu_infos) - 1:
        for i in range(len(frequencies)):
            cpu_infos[i] += (str(frequencies[i].current) + 'MHz',)
    return cpu_infos
Code example #15
 def price_request(self, requested_fib):
     return int(floor(logarithm(requested_fib, 10))) + 1
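
In effect the price is the number of decimal digits of `requested_fib` (for positive inputs); for example, with `floor` from `math` and `logarithm` as `math.log`:

>>> int(floor(logarithm(42, 10))) + 1
2
>>> int(floor(logarithm(750, 10))) + 1
3
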
Code example #16
 def price_request(self, requested_fib):
     return int(floor(logarithm(requested_fib,10))) + 1
def update_modbus_registers(args):
    log.debug("Updated thread started.")
    update_interval_seconds = 5
    heartbeat_counter = 1
    heartbeat_counter_max_value = 10
    register_type = 4
    register_offset = 0
    while (CONTINUE_UPDATING_MODBUS_REGISTERS is True):
        log.debug("Updating the server registers")
        simulated_modbus_server_context = args[0]
        # Initialize the number of discovered bluetooth devices to 0
        number_of_nearby_bluetooth_devices = 0
        # Initialize the temperature to a simulated random value
        temperature = int((110 + 4 * logarithm(100 * random())) * 100)
        # Read the board temperature
        try:
            temperature = int((float(
                popen("vcgencmd measure_temp").readline().replace(
                    "temp=", "").replace("'C", "")) * 9 / 5 + 32) * 100)
        except Exception as ex:
            # Log any error, if it occurs
            log.debug("Error reading temperature: " + str(ex))
            log.debug(
                "Simulated temperature data will be generated instead of a real value"
            )
        # Scan for nearby devices
        if (BLUETOOTH_DEVICE_SCANNING_ENABLED):
            try:
                # Save the results to a file
                popen(
                    "sudo timeout -s SIGINT 1s hcitool -i hci0 lescan --passive > bluetoothScanResults.txt"
                )
                # Open the file and count the lines, and save the line count as the number of devices (omit the header line)
                number_of_nearby_bluetooth_devices = len(
                    open("bluetoothScanResults.txt").readlines()) - 1
                if (number_of_nearby_bluetooth_devices == -1):
                    raise Exception(
                        "Possible popen error", "len(open("
                        "bluetoothScanResults.txt"
                        ").readlines()) equals zero")
            except Exception as ex:
                # Log any error, if it occurs
                log.debug("Error scanning for bluetooth devices: " + str(ex))
                log.debug(
                    "Default value of 0 will be used instead of a real value")
        # Write the new values back to the Modbus register
        new_register_values = [
            temperature, number_of_nearby_bluetooth_devices, heartbeat_counter
        ]
        log.debug("New values: " + str(new_register_values))
        simulated_modbus_server_context.setValues(register_type,
                                                  register_offset,
                                                  new_register_values)
        # Increment the heartbeat counter by one
        heartbeat_counter = heartbeat_counter + 1
        # Reset the counter if necessary
        if (heartbeat_counter > heartbeat_counter_max_value):
            heartbeat_counter = 1
        # Wait until the next loop
        sleep(update_interval_seconds)
    # Once broken out of the loop, note that the thread is over
    log.debug("Updated thread ended.")
Code example #18
def get_bic(lnL, K, n):
    bic = (-2.0 * lnL) + (K * logarithm(n))
    return bic
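
Assuming the single-argument `logarithm` is the natural log (as BIC is normally defined), a call with hypothetical values looks like this:

bic = get_bic(lnL=-1234.5, K=10.0, n=1000.0)
# -2 * (-1234.5) + 10 * log(1000) = 2469.0 + 69.08..., i.e. roughly 2538.08
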
Code example #19
File: classifier.py Project: payoj21/Eattreat
def naivebayes(fileid, string_tags, dictionary, global_var):
    import sys
    import os
    import codecs
    import json
    pathname = "eattreat_nlp_taggenerator/"
    path = [
        'Bakery&Sweets/', 'Snacks/', 'Meats/', 'Organics/', 'Other/',
        'Drinks/', 'Restaurants/'
    ]
    path1 = [
        'Bakery&Sweets', 'Snacks', 'Meats', 'Organics', 'Other', 'Drinks',
        'Restaurants'
    ]
    dict_bakery = ['bakery', 'cake', 'chocolate', 'dessert', 'sweet']
    dict_snacks = ['cafe', 'street', 'chaat', 'food', 'snack', 'golgappe']
    dict_organics = ['healthy', 'detox', 'vegan', 'salad', 'diet', 'dietary']
    dict_restaurants = [
        'restaurant', 'bar', 'new', 'market', 'menu', 'eatery', 'kitchen',
        'hotel', 'cafe'
    ]
    dict_meats = [
        'chicken', 'biryani', 'seafood', 'prawn', 'fish', 'salmon', 'mutton',
        'meat'
    ]
    dict_drinks = [
        'rum', 'cocktail', 'mocktail', 'drink', 'beer', 'wine', 'drinking',
        'tea', 'coffee', 'whisky', 'whiskey'
    ]
    dict_others = ['festival', 'fest', 'travel']

    dict_top_keywords = {
        'Bakery&Sweets': dict_bakery,
        'Snacks': dict_snacks,
        'Meats': dict_meats,
        'Organics': dict_organics,
        'Other': dict_others,
        'Drinks': dict_drinks,
        'Restaurants': dict_restaurants
    }
    vocab = [{}, {}, {}, {}, {}, {}, {}]
    V = []
    alltags = set()
    classoccur = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    for p in range(len(path)):

        for filename in os.listdir(pathname + path[p]):
            if not filename.startswith('.'):
                classoccur[p] += 1
                inputfile = codecs.open(pathname + path[p] + filename, 'r')
                for line in inputfile:
                    content = line.split("\t")
                    post_id = content[0]
                    post_title = content[1]
                    post_tags = content[2]
                    tags = post_tags.split(', ')
                    terms_freq = dictionary[post_id]
                    #          print terms_freq
                    for t in tags:
                        if t != '':
                            t_freq = terms_freq[t]
                            #              hyp_count = t.count('-')
                            tt = t.split('-')
                            t_score = 0
                            for ttt in tt:
                                if ttt not in alltags:
                                    alltags.add(ttt)
#                 if ttt in dict_top_keywords[path1[p]]:
#                   t_score = t_score + (hyp_count+1)
                                if ttt not in vocab[p]:
                                    vocab[p].update({ttt: 1})
                                else:
                                    vocab[p][ttt] += 1

        V.append(sum(vocab[p].values()))

    naive = [{}, {}, {}, {}, {}, {}, {}]
    lenalltags = len(alltags)

    #---------------------DICTIONARIES------------------------------------------
    if global_var < 2:
        for alpha in range(len(vocab)):
            class_dict = open('dictionaries/' + path1[alpha] + '.txt', 'a')
            for key in vocab[alpha]:
                class_dict.write(
                    str(key) + '\t' + str(vocab[alpha][key]) + '\n')

#---------------------------------------------------------------------------

    test_tags = []

    total_tags = string_tags.split(", ")
    frequency = dictionary[fileid]
    sum1 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    for k in range(len(vocab)):
        s = 0

        for e in total_tags:
            fterm_freq = frequency[e]
            hyphen_count = e.count('-')
            ee = e.split('-')
            e_score = 1
            for eee in ee:
                if eee not in alltags:
                    alltags.add(eee)
                    lenalltags += 1
                if eee in dict_top_keywords[path1[k]]:
                    e_score = (hyphen_count + 1)
                if eee not in vocab[k]:
                    vocab[k].update({eee: 1})
                    V[k] += 1
                else:
                    vocab[k][eee] += 1
                    V[k] += 1
                naive[k].update({
                    eee:
                    e_score *
                    float(float(1 + vocab[k][eee]) / float(lenalltags + V[k]))
                })
                s = s + math.logarithm(naive[k][eee])

        beta = float(s + math.logarithm(classoccur[k] / sum(classoccur)))
        sum1[k] = beta

    inputfile.close()

    max_value = max(sum1)
    max_index = sum1.index(max_value)

    classoccur[max_index] += 1

    print path1[max_index]
    return path1[max_index]


#naivebayes()
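
The scoring loop above is essentially log-space Naive Bayes: each class receives the log of its prior plus the sum of log conditional term probabilities, with add-one-style smoothing over the class vocabulary. A minimal sketch of that scheme, independent of the file layout and keyword dictionaries used above (all names here are hypothetical):

from math import log as logarithm

def naive_bayes_score(doc_terms, class_term_counts, class_doc_count, total_docs):
    # class_term_counts maps term -> frequency within this class; smoothing uses the
    # class vocabulary size plus the total number of term occurrences in the class.
    vocab_size = len(class_term_counts)
    total_terms = sum(class_term_counts.values())
    score = logarithm(class_doc_count / total_docs)       # log prior
    for term in doc_terms:
        p = (class_term_counts.get(term, 0) + 1) / (total_terms + vocab_size)
        score += logarithm(p)                             # log likelihood of each term
    return score
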
Code example #20
# Base-10 log by default; non-positive inputs fall back to the log of the
# smallest positive float instead of raising a math domain error.
def log(num, base=10):
	if num > 0:
		return logarithm(num, base)

	return logarithm(sys.float_info.min * sys.float_info.epsilon, base)
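
The fallback means non-positive inputs return a large negative number instead of raising a math domain error, for example:

>>> log(0) < -300     # base-10 log of the smallest positive float, roughly -323.3
True
>>> log(-5) == log(0)   # every non-positive input hits the same fallback
True
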
Code example #21
def moredigits(a, b):
    # Compare decimal digit counts via floor(log10): True if a has more digits than b.
    loga, logb = logarithm(a, 10), logarithm(b, 10)
    loga, logb = floor(loga), floor(logb)
    return loga > logb
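
In other words, it reports whether `a` has strictly more decimal digits than `b` (for positive inputs):

>>> moredigits(1234, 987)
True
>>> moredigits(55, 66)
False
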
Code example #22
File: util.py Project: marekborowiec/partitionfinder
def get_bic(lnL, K, n):
    bic = (-2.0 * lnL) + (K * logarithm(n))
    return bic
Code example #23
File: phyml.py Project: wrightaprilm/partitionfinder
def likelihood_parser(phyml_lk_file):
    '''
    Takes a *_phyml_lk.txt file and returns a dictionary of sites and site
    likelihoods and a dictionary of sites and lists of likelihoods under
    different rate categories. If no rate categories are specified, it will
    return a dictionary with sites and likelihoods P(D|M) for each site.

    Here is an example of the first few lines of the file that it takes:

    Note : P(D|M) is the probability of site D given the model M (i.e., the
    site likelihood) P(D|M,rr[x]) is the probability of site D given the model
    M and the relative rate of evolution rr[x], where x is the class of rate to
    be considered.  We have P(D|M) = \sum_x P(x) x P(D|M,rr[x]).

    Site   P(D|M)          P(D|M,rr[1]=2.6534)   P(D|M,rr[2]=0.2289)   P(D|M,rr[3]=0.4957)   P(D|M,rr[4]=1.0697)   Posterior mean
    1      2.07027e-12     1.3895e-19            6.2676e-12            1.2534e-12            1.21786e-15           0.273422
    2      1.8652e-07      2.05811e-19           6.73481e-07           4.14575e-09           7.97623e-14           0.23049
    3      4.48873e-15     1.37274e-19           7.11221e-15           9.11826e-15           9.21848e-17           0.382265
    4      3.38958e-10     1.31413e-19           1.18939e-09           4.20659e-11           5.86537e-15           0.237972
    5      8.29969e-17     1.11587e-19           3.1672e-17            2.52183e-16           1.9722e-17            0.502077
    6      9.24579e-09     1.59891e-19           3.31101e-08           4.79946e-10           2.59524e-14           0.232669
    7      3.43996e-10     2.1917e-19            1.19544e-09           5.43128e-11           1.22969e-14           0.240455
    8      4.43262e-13     1.1447e-19            1.32148e-12           2.8874e-13            3.7386e-16            0.27685
    9      3.42513e-11     1.70149e-19           1.14227e-10           1.02103e-11           4.05239e-15           0.250765
    10     1.15506e-11     1.28107e-19           3.86378e-11           3.32642e-12           1.46151e-15           0.250024
    '''
    try:
        with open(str(phyml_lk_file)) as phyml_lk_file:
            # The phyml_lk files differ based on whether different rate
            # categories are estimated or not, this figures out which
            # file we are dealing with
            phyml_lk_file.next()
            line2 = phyml_lk_file.next()
            # Check to see if the file contains rate categories
            if line2[0] != "P":
                phyml_lk_file.next()

            # If it contains rate categories, we need to skip a few more lines
            else:
                for _ in xrange(4):
                    phyml_lk_file.next()
            # Read in the contents of the file and get rid of whitespace
            list_of_dicts = list(csv.DictReader(phyml_lk_file,
                delimiter = " ", skipinitialspace = True))
    except IOError:
        raise IOError("Could not find the likelihood file!")
    phyml_lk_file.close()

    # Right now, when the alignment is over 1,000,000 sites, PhyML
    # merges the site number with the site likelihood, catch that and
    # throw an error
    if len(list_of_dicts) > 999999:
        raise IOError("PhyML file cannot process more than 1 M sites")

    # The headers values change with each run so we need a list of them
    headers = []
    for k in list_of_dicts[0]:
        headers.append(k)
    # Sort the headers into alphabetical order
    headers.sort()

    # Check if the rate categories were estimated, if they weren't
    # just return the likelihood scores for each site, otherwise, return
    # site likelihoods and likelihoods under each rate category
    if len(headers) < 4:
        # Make a list of site log likelihoods
        likelihood_list = [[logarithm(float(site[headers[1]]))] for site in list_of_dicts]
        return likelihood_list

    else:
        # Make a list of site log likelihoods
        if list_of_dicts[0][headers[1]] == 'nan' or list_of_dicts[0][headers[1]] == 'inf':
            likelihood_list = None
            print "Whoopsies!"
            rate_list = None
            lk_rate_list = None
            lk_site_rate_list = None
            return likelihood_list, lk_rate_list, rate_list, lk_site_rate_list

        else:
            likelihood_list = [[logarithm(float(site[headers[1]]))] for site in list_of_dicts]

            # Make a rate list
            # print list_of_dicts[0][headers[len(headers) - 3]]
            # if list_of_dicts[0][headers[len(headers) - 3]] == 'nan' or list_of_dicts[0][headers[len(headers) - 3]] == 'inf':
            #     rate_list = None
            #     print "Whoopsies!"
            # else:
            rate_list = [[(logarithm(float(site[headers[len(headers) - 3]])))] for site in list_of_dicts]

            # Now make a list of lists of site likelihoods under different
            # rate categories
            lk_rate_list = []
            for i in list_of_dicts:
                ind_lk_list = []
                # Pull the likelihood from each rate category by calling the
                # appropriate key from "headers"
                for num in range(2, len(headers) - 3):
                    ind_lk_list.append(logarithm(float(i[headers[num]])))
                # Now add the list of likelihoods for the site to a master list
                lk_rate_list.append(ind_lk_list)

            # Now pull likelihoods and rates for a two dimensional list
            lk_site_rate_list = []
            for i in list_of_dicts:
                ind_lk_r_list = []
                ind_lk_r_list.append(logarithm(float(i[headers[1]])))
                ind_lk_r_list.append(logarithm(float(i[headers[len(headers) - 3]])))
                lk_site_rate_list.append(ind_lk_r_list)
            # Return both the list of site likelihoods and the list of lists of
            # likelihoods under different rate categories
            return likelihood_list, lk_rate_list, rate_list, lk_site_rate_list