Example 1
def main():
    path = Path('../data/partial')
    train_X, train_X_str, train_Y, train_Y_str, test_X, test_X_str, test_Y, test_Y_str, word2index, tag2index, index2tag = get_xy(
        path)

    print('************Training Set Summary*************')
    T, V = len(tag2index), len(word2index)
    print('Number of tags: {}, Number of words: {}'.format(T, V))

    print('************Train*************')
    start = time.time()
    emission_weight = get_emission_weight(train_X, train_Y, word2index,
                                          tag2index)
    transition_weight = get_transition_weight(train_Y, tag2index)
    end = time.time()
    time_elapsed(start, end)

    print('************Saving Model Outputs*************')
    path_output = path / 'dev.p2.out'
    viterbi_output(path_output, test_X_str, test_X, tag2index, emission_weight,
                   transition_weight, link_weight_sum)

    prec, rec, f1 = eval(train_X, train_Y_str, tag2index, emission_weight,
                         transition_weight, link_weight_sum)
    print('precision, recall, f1 on training set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(test_X, test_Y_str, tag2index, emission_weight,
                         transition_weight, link_weight_sum)
    print('precision, recall, f1 on test set: {0} {1} {2}'.format(
        prec, rec, f1))
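
The helpers get_emission_weight and get_transition_weight are not shown on this page. As a rough orientation only, a count-based construction in the spirit of an HMM could look like the sketch below; the real project code may differ, and the smoothing constant eps is an assumption (transition weights could be built the same way from consecutive tag pairs).

import numpy as np

def count_based_emission_weight(train_X, train_Y, word2index, tag2index, eps=1e-8):
    # Hypothetical sketch, not the project's get_emission_weight:
    # estimate log P(word | tag) from co-occurrence counts, assuming
    # train_X / train_Y are lists of index sequences as in main() above.
    T, V = len(tag2index), len(word2index)
    counts = np.zeros((T, V))
    for sentence, tags in zip(train_X, train_Y):
        for word_idx, tag_idx in zip(sentence, tags):
            counts[tag_idx, word_idx] += 1
    probs = (counts + eps) / (counts + eps).sum(axis=1, keepdims=True)
    return np.log(probs)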
Example 2
def UniProt(gene_list, gene_name_list, get_info=True):
    """
    This procedure searches UniProt human polymorphism and disease mutations
    (download to computer) for information about disease.

    Parameter gene_list: a list of genes
    Preconditon: gene_list is a list of genes

    Parameter gene_name_list: a list of gene names
    Preconditon: gene_name_list is a list of Gene names [str]

    Parameter gene_name_set: a set of gene names (used for searching)
    Preconditon: gene_name_set is a set of gene names

    Parameter get_info: a boolean that says whether UniProt should search the
    web for dbSNP information
    Preconditon: get_info is a bool
    """
    if not get_info:
        print("Searching UniProt. This takes a few seconds.")
    else:
        print("Searching UniProt for mutation info. This takes a few minutes.")
    filename = helper.edit_filename(
        "humsavar.txt", "sibling", foldername='databases')
    with open(filename, "r") as g:
        data = g.read()
    start_time = time.time()
    i = 0
    for gene_name in gene_name_list:
        if '\n' + gene_name + " " in data:
            row_list = re.findall('\n' + gene_name + r" .+", data)
            helper.time_elapsed(start_time)
            for row in row_list:
                if "Disease" in row:
                    disease_name = re.search(r"(rs\w+|-)\s+(.+)", row).group(2)
                    ind = gene_name_list.index(gene_name)
                    gene = gene_list[ind]
                    gene.set_disease(disease_name, gene_name,
                                     "UniProt", get_info)
                    if get_info:
                        disease = gene.disease_list()[-1]
                        dbSNP = re.search(r"(rs\w+|-)", row).group(1)
                        aa_change = re.search(r"(p\.\w+)", row).group(0)
                        disease.set_amino_change(aa_change)
                        disease.set_SNP(dbSNP)
                        disease.get_SNP_info()
                        if disease.has_full_info():
                            i = i + 1
                        if i == 6:
                            i = 0
                            time.sleep(random.uniform(0.5, 1.25))
                            helper.time_elapsed(start_time)
                            time.sleep(random.uniform(0.5, 1.25))
    print("\nUniProt database complete")
Example 3
def main():
    path = Path('../data/full')
    train_X, train_X_str, train_Y, train_Y_str, dev_X, dev_X_str, dev_Y, dev_Y_str, test_X, test_X_str, test_Y, test_Y_str, word2index, tag2index, postag2index, index2tag = get_xy(
        path)

    print('************Training Set Summary*************')
    T, V, POS = len(tag2index), len(word2index), len(postag2index)
    print('Number of tags: {}, Number of words: {}, Number of POS tags: {}'.
          format(T, V, POS))

    print('************Train*************')
    start = time.time()
    optimal_transition_weight, optimal_emission_weight, optimal_emission_weight_pos, optimal_combination_weight, optimal_combination_weight_pos = trainDecay(
        train_X,
        train_Y,
        dev_X,
        dev_Y_str,
        tag2index,
        word2index,
        postag2index,
        link_weight_sum,
        iteration=20,
        random_seed=1)
    end = time.time()
    time_elapsed(start, end)

    # print('************Saving Model Parameters*************')
    # path_transition = path/'best_weight_features3_transition_strp.npy'
    # path_emission = path/'best_weight_features3_emission_strp.npy'
    # np.save(path_transition, optimal_transition_weight)
    # np.save(path_emission, optimal_emission_weight)

    print('************Evaluation*************')
    prec, rec, f1 = eval(train_X, train_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos,
                         optimal_combination_weight,
                         optimal_combination_weight_pos, link_weight_sum)
    print('precision, recall, f1 on training set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(dev_X, dev_Y_str, tag2index, optimal_emission_weight,
                         optimal_transition_weight,
                         optimal_emission_weight_pos,
                         optimal_combination_weight,
                         optimal_combination_weight_pos, link_weight_sum)
    print('precision, recall, f1 on development set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(test_X, test_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos,
                         optimal_combination_weight,
                         optimal_combination_weight_pos, link_weight_sum)
    print('precision, recall, f1 on test set: {0} {1} {2}'.format(
        prec, rec, f1))
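
time_elapsed(start, end) is used throughout these tagging examples but defined elsewhere in the project. A minimal sketch of such a helper, assuming it only reports the wall-clock duration between two time.time() stamps:

def time_elapsed(start, end):
    # Hypothetical helper, not the project's implementation:
    # print the elapsed wall-clock time in minutes and seconds.
    minutes, seconds = divmod(end - start, 60)
    print('Time elapsed: {:.0f}m {:.2f}s'.format(minutes, seconds))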
Example 4
def main():
    path = Path('../data/full')
    train_X, train_X_str, train_Y, train_Y_str, test_X, test_X_str, test_Y, test_Y_str, word2index, tag2index, postag2index, index2tag = get_xy(
        path)

    print('************Training Set Summary*************')
    T, V, POS = len(tag2index), len(word2index), len(postag2index)
    print('Number of tags: {}, Number of words: {}, Number of POS tags: {}'.
          format(T, V, POS))

    Lambda = 0.1

    def callbackF(w):
        loss = get_loss_grad(w)[0]
        transition_weight = w[:(T + 1) * (T + 1)].reshape((T + 1, T + 1))
        emission_weight = w[(T + 1) * (T + 1):(T + 1) * (T + 1) +
                            T * V].reshape((T, V))
        emission_weight_pos = w[(T + 1) * (T + 1) + T * V:].reshape((T, POS))
        loss_l2 = LossRegularization(emission_weight,
                                     transition_weight,
                                     emission_weight_pos,
                                     param=Lambda)
        print('Loss:{:.4f} L2 Loss:{:.4f}'.format(loss, loss_l2))

    def get_loss_grad(w):
        with HiddenPrints():
            transition_weight = w[:(T + 1) * (T + 1)].reshape((T + 1, T + 1))
            emission_weight = w[(T + 1) * (T + 1):(T + 1) * (T + 1) +
                                T * V].reshape((T, V))
            emission_weight_pos = w[(T + 1) * (T + 1) + T * V:].reshape(
                (T, POS))
            loss = Loss(train_X,
                        train_Y,
                        tag2index,
                        emission_weight,
                        transition_weight,
                        emission_weight_pos,
                        param=Lambda)
            grads_transition = GradientTransition(train_X,
                                                  train_Y,
                                                  tag2index,
                                                  emission_weight,
                                                  transition_weight,
                                                  emission_weight_pos,
                                                  param=Lambda)
            grads_emission = GradientEmission(train_X,
                                              train_Y,
                                              tag2index,
                                              word2index,
                                              emission_weight,
                                              transition_weight,
                                              emission_weight_pos,
                                              param=Lambda)
            grads_emission_pos = GradientEmissionPOS(train_X,
                                                     train_Y,
                                                     tag2index,
                                                     postag2index,
                                                     emission_weight,
                                                     transition_weight,
                                                     emission_weight_pos,
                                                     param=Lambda)
            grads = np.concatenate(
                (grads_transition.reshape(-1), grads_emission.reshape(-1),
                 grads_emission_pos.reshape(-1)))
        return loss, grads

    print('************Train*************')
    start = time.time()
    init_w = np.zeros(((T + 1) * (T + 1) + T * (V + POS), ))
    optimal_weight, final_loss, result_dict = fmin_l_bfgs_b(get_loss_grad,
                                                            init_w,
                                                            pgtol=0.01,
                                                            callback=callbackF)
    end = time.time()
    time_elapsed(start, end)

    print('************Saving Model Parameters*************')
    optimal_transition_weight = optimal_weight[:(T + 1) * (T + 1)].reshape(
        (T + 1, T + 1))
    optimal_emission_weight = optimal_weight[(T + 1) * (T + 1):(T + 1) *
                                             (T + 1) + T * V].reshape((T, V))
    optimal_emission_weight_pos = optimal_weight[(T + 1) * (T + 1) +
                                                 T * V:].reshape((T, POS))
    path_transition = path / 'best_weight_features2_transition.npy'
    path_emission = path / 'best_weight_features2_emission.npy'
    path_emission_pos = path / 'best_weight_features2_emission_pos.npy'
    np.save(path_transition, optimal_transition_weight)
    np.save(path_emission, optimal_emission_weight)
    np.save(path_emission_pos, optimal_emission_weight_pos)

    print('************Saving Model Outputs*************')
    path_output = path / 'dev.p5.CRF.f3.out'
    viterbi_output(path_output, test_X_str, test_X, tag2index,
                   optimal_emission_weight, optimal_transition_weight,
                   optimal_emission_weight_pos, link_weight_sum)

    print('************Evaluation*************')
    prec, rec, f1 = eval(train_X, train_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos, link_weight_sum)
    print('precision, recall, f1 on training set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(test_X, test_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos, link_weight_sum)
    print('precision, recall, f1 on test set: {0} {1} {2}'.format(
        prec, rec, f1))
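
The flat weight vector handed to fmin_l_bfgs_b packs three blocks in a fixed order: a (T+1) x (T+1) transition matrix, a T x V word-emission matrix and a T x POS POS-emission matrix; get_loss_grad and the post-training code unpack it with the same slices. A quick sanity check of that layout with toy dimensions (illustrative only):

import numpy as np

T, V, POS = 3, 5, 4  # toy dimensions for illustration only
w = np.arange((T + 1) * (T + 1) + T * (V + POS), dtype=float)

transition = w[:(T + 1) * (T + 1)].reshape((T + 1, T + 1))
emission = w[(T + 1) * (T + 1):(T + 1) * (T + 1) + T * V].reshape((T, V))
emission_pos = w[(T + 1) * (T + 1) + T * V:].reshape((T, POS))

assert transition.size + emission.size + emission_pos.size == w.size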
Example 5
def NLM(gene_list, gene_name_list, gene_name_set):
    """
    This procedure searches the GHR NLM database of genes and finds all of the
    genes that are in a list of genes.

    The database can be found here: https://ghr.nlm.nih.gov/gene
    Parameter gene_list: a list of genes
    Preconditon: gene_list is a list of genes

    Parameter gene_name_list: a list of gene names
    Preconditon: gene_name_list is a list of Gene names [str]

    Parameter gene_name_set: a set of gene names (used for searching)
    Preconditon: gene_name_set is a set of gene names

    Parameter start_time: the time this function began. This is used to calculate
    the amount of time this function takes to execute and print it on the screen.
    Preconditon: start_time is a time object.
    """
    url1 = "https://ghr.nlm.nih.gov/gene?initial="
    alphabet = list(ascii_lowercase)  # list of letters
    ua = UserAgent()  # Randomizes the User-Agent header so the site does not
    # block these requests; each header looks like a regular browser's.
    this_list = []
    print("Searching NLM website. This takes a few seconds.")
    start_time = time.time()
    for let in alphabet:
        headers = {"User-Agent": ua.random}  # uses the thingy I generated
        request = Request(url1 + let, headers=headers)  # request website
        response = urlopen(request)  # open the website
        respData = response.read()  # read contents on the website
        response.close()  # close the website
        page_soup = soup(respData, "html.parser")  # get HTML from website
        raw_info = page_soup.findAll("ul", {"class": "browse-results"})[0].text
        # Collect every gene symbol on this page that starts with the letter.
        this_list = this_list + \
            re.findall("(" + let.upper() + r"[A-Z\d]+): \w", raw_info)
        helper.time_elapsed(start_time)
    for gene_name in this_list:
        helper.time_elapsed(start_time)
        url2 = "https://ghr.nlm.nih.gov/gene/"
        if gene_name in gene_name_set:
            try:
                ind = gene_name_list.index(gene_name)
                gene = gene_list[ind]
                headers = {"User-Agent": ua.random}
                request = Request(url2 + gene_name +
                                  "#conditions", headers=headers)
                response = urlopen(request)
                respData = response.read()
                response.close()
                page_soup = soup(respData, "html.parser")
                dis1 = re.findall(
                    r"Health Conditions Related to Genetic Changes\s+(([\w+,-]+ )+)", page_soup.text)
                disease = dis1[0][0]
                gene.set_disease(disease, gene_name, "NLM")
                diseases = re.findall(r"More About This Health Condition\s+(([\w+,-]+ )+)",
                                      page_soup.text)[0:-1]
                for disease in diseases:
                    gene.set_disease(disease[0], gene_name, "NLM")
            except IndexError:
                pass
    print("\nNLM database complete.")