Ejemplo n.º 1
0
    QC = str(case_SeqInfo['QC'])
    
    # PS1
    PS_set = ['PS1', 'PS2']
    for PS in PS_set:
        HLATyping = case_SeqInfo[PS]['HLATyping']
        Donor = case_SeqInfo[PS]['Donor']
        Recipient = case_SeqInfo[PS]['Recipient']
        record = (BMT_caseID, Audit, Active, Comment, QC, HLATyping, PS, Donor, Recipient, )
        cursor.execute('INSERT INTO OriginalSeqs VALUES (?,?,?,?,?,?,?,?,?)', record)
        conn.commit()

# Close the SQLite connection that the INSERT loop above was writing to.
conn.close()

# Save the collected records; the file name is built from the output prefix
# and the locus that was being processed.
fname = output + 'SG41_52_HLA_' + locus + '_paired'
IMGTdbIO.save_dict2pickle(available_records, fname)

# Kept for manual round-trip checks of the pickle written above.
#aa = IMGTdbIO.load_pickle2dict(fname, output)


#################
# Class II
#################
# Pick one of the per-locus availability databases by position.
# NOTE(review): glob.glob() does not guarantee any ordering, so the index ->
# locus mapping noted below depends on the directory listing order of the
# machine this runs on -- confirm before relying on index 4.
all_DB_files = glob.glob("../Output/SG41_52/2018/IMGTv3310/AvailDB/*.db")
db_file = all_DB_files[4] ## 0: DPB1 2:DRB1 5:DQB1

# The locus is taken as the fifth underscore-separated token of the path.
# Presumably the file names follow a fixed "<...>_<locus>_<...>.db" layout;
# verify against the actual files.
locus = db_file.split('_')[4]

conn = sql.connect(db_file) # automatically creates a file if doesn't exist
conn.row_factory = sql.Row  # rows can be read by column name as well as by index
cursor = conn.cursor()
Ejemplo n.º 2
0
@author: hhuang2
"""

# import glob
import sqlite3 as sql
# from utils import phase_block_check as ps
from utils import IMGTdbIO, CompareSeq
import os
import re

# Locus whose paired-case pickle is loaded and whose comparison DB is created.
locus = 'DQB1'

# Older location of the paired-case pickle, kept for reference.
#pkl_fp = '../Output/SG39_DRpairs/SG39_HLA_'+ locus +'_paired.pkl'
pkl_fp = '../Output/SG39/2018/SG39_DRpairs/SG39_HLA_' + locus + '_paired.pkl'

# Load the paired cases for this locus (one entry per case, judging by the
# message printed below).
DRpair_seqInfo = IMGTdbIO.load_pickle2dict(pkl_fp)

case_count = len(DRpair_seqInfo)
print('Locus ' + locus + ' has ' + str(case_count) + ' paired cases.')

# Open (or create) the per-locus SQLite file and make sure the comparison
# table exists; CREATE TABLE IF NOT EXISTS makes this safe to re-run.
DB_fp = "../Output/SG39/2018/SG39_DRpairs/SG39_HLA_" + locus + "_paired.db"
conn = sql.connect(DB_fp)
cursor = conn.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS DR_pair_comparison 
               (BMT_caseID text, QC text, 
               PS1_HLATyping text, 
               PS1_GLstringM text, PS1_SeqM text, 
               PS2_HLATyping text,
               PS2_GLstringM text, PS2_SeqM text,
               Audit text, Active text, Comment text)''')
Ejemplo n.º 3
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Oct  8 13:24:16 2017

@author: hhuang2
"""
from utils import IMGTdbIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

# Exploratory script: fetch the Exon1-Exon8 reference sequences of two HLA-A
# typings and translate the first one.
typing1 = 'A*23:17'
Refseq1 = IMGTdbIO.readIMGTsql(typing1, db_fp= '../Database/', field = 'Exon1, Exon2, Exon3, Exon4, Exon5, Exon6, Exon7, Exon8')

typing2 = 'A*23:01:01'
Refseq2 = IMGTdbIO.readIMGTsql(typing2, db_fp= '../Database/', field = 'Exon1, Exon2, Exon3, Exon4, Exon5, Exon6, Exon7, Exon8')

# NOTE(review): Bio.Alphabet (generic_dna) was removed in Biopython 1.78;
# on newer releases Seq() takes the sequence only -- confirm the pinned
# Biopython version before running.
coding_dna = Seq(Refseq1, generic_dna)
# The two bare expressions below only display a value in an interactive
# session; their results are not stored.
coding_dna.translate()

str(coding_dna.translate())

# Concatenate the pieces of Refseq1 into one string. str.join replaces the
# original index loop, which rebuilt the string on every iteration.
seq = ''.join(Refseq1)

# typing1 / typing2 still hold the values assigned at the top of this block.
HLAtyping = typing1+'_'+typing2
Exons = 'Exon1, Exon2, Exon3, Exon4, Exon5, Exon6, Exon7, Exon8'
Ejemplo n.º 4
0
"""

import glob
import csv

from utils import IMGTdbIO, CompareSeq

# Which group of paired cases to analyse; the alternatives are listed in the
# trailing comment.
groupType = 'fiveLoci_paired'  # groupType = 'ClassI_paired' # groupType = 'All_paired' ; 'fiveLoci_paired'

# Locus groupings used by the analysis.
All_loci = ['A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1']
Five_loci = ['A', 'B', 'C', 'DRB1', 'DQB1']
ClassI_loci = ['A', 'B', 'C']
ClassII_loci = ['DRB1', 'DQB1', 'DPB1']

# NOTE(review): the file-name prefix is hard-coded to 'fiveLoci_paired' and
# groupType is appended on top of it -- confirm the file exists for other
# groupType values.
Group_fname = '../Output/SG39/2018/SG39_Stats/fiveLoci_paired_Stats_0125_' + groupType + '.pkl'
Stats_Dict = IMGTdbIO.load_pickle2dict(Group_fname)

# The stats pickle carries two top-level entries.
CaseStats = Stats_Dict['CaseStats']
LocusStats = Stats_Dict['LocusStats']

db_fp = '../Database/'
# Leftovers from interactive inspection of a single case.
#key = '84571'
#CaseStats[key]
#key in group_caseIDs

## : paired cases HLA typing stats
fname = '../Output/SG39/2018/SG39_DRpairs/SG39_pairedCases_Stats.pkl'
Matching_cases_stats = IMGTdbIO.load_pickle2dict(fname)

# Filled by the per-locus loop that follows.
CaseMatchTable = {}
for locus in All_loci:
"""
Created on Tue Oct  3 14:42:06 2017

@author: hhuang2
"""

import glob
import sqlite3 as sql
# from utils import phase_block_check as ps
from utils import IMGTdbIO, CompareSeq
import os
import re


# Load the paired-case statistics; the result is indexed by group type below.
fname = '../Output/SG41_52/2018/IMGTv3310/SG41_52_DRpair_Stats/SG41_52_pairedCases_Stats.pkl'
Matching_cases_stats = IMGTdbIO.load_pickle2dict(fname)

## 'All_paired'
# Select the case IDs of the chosen group; alternatives are in the trailing
# comment.
groupType = 'fiveLoci_paired' # groupType = 'ClassI_paired' # groupType = 'All_paired'
group_caseIDs = Matching_cases_stats[groupType]
# DPB1 is deliberately left out of the locus lists in this run.
All_loci = ['A', 'B', 'C', 'DRB1', 'DQB1']#, 'DPB1']
ClassI_loci = ['A', 'B', 'C']
ClassII_loci = ['DRB1', 'DQB1']

# Accumulators filled by the per-case loop that follows.
CaseStats = {}
LocusStats = {}
#MatchStats = {}
for caseID in group_caseIDs:
    # 
    for locus in ClassI_loci:
        ARSregion = ['Exon2', 'Exon3']
def _dqb102_find_match(seq_item, references):
    """Return ``(typing_index, exon_index)`` of the first reference exon found.

    The references are tested in the order first typing Exon2, first typing
    Exon3, second typing Exon2, second typing Exon3. ``None`` is returned
    when none of them is a substring of *seq_item*.
    """
    for typing_idx, exons in enumerate(references):
        for exon_idx in (0, 1):
            if exons[exon_idx] in seq_item:
                return typing_idx, exon_idx
    return None


def _dqb102_store(QueryTyping, phase, glstring, seq_item, exon_idx,
                  two_blocks):
    """Record *seq_item* under *phase* with the block ID rules of the caller."""
    entry = QueryTyping.get(phase)
    if entry is None:
        # Only the Exon3 block of a two-block (DQB1*02) typing starts as
        # block 2; every other first hit is block 1.
        first_block = 2 if (two_blocks and exon_idx == 1) else 1
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item],
            "blockIDs": [first_block]
        }
    elif two_blocks and exon_idx == 0:
        # The Exon3 block was seen first: put the Exon2 block in front so
        # the sequences end up in block order.
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item, entry["Sequence"][0]],
            "blockIDs": [1, 2]
        }
    else:
        entry["Sequence"].append(seq_item)
        entry["blockIDs"].append(2)


def check_DQB102_Block_seq(seq_count,
                           tplist,
                           unique_Query,
                           unique_HLATyping_list,
                           ID,
                           version="3310"):
    """Assign query sequences to phase sets when DQB1*02 may be involved.

    A typing whose first field is ``DQB1*02`` is handled as two blocks (an
    Exon2 block and an Exon3 block); any other typing is handled as a single
    block. Each query sequence is matched by substring against the Exon2 and
    Exon3 reference sequences of the two typings.

    Args:
        seq_count: Number of sequences of the case; more than 3 only
            triggers a warning message.
        tplist: The two HLA typings; ``tplist[0]`` maps to ``"PS1"`` and
            ``tplist[1]`` to ``"PS2"``.
        unique_Query: Iterable of query sequence strings.
        unique_HLATyping_list: GL strings; index 0 is stored with PS1,
            index 1 with PS2, the whole list with PS3.
        ID: Case identifier, used only in printed messages.
        version: IMGT database version passed to ``IMGTdbIO.readIMGTsql``.

    Returns:
        dict mapping ``"PS1"``/``"PS2"``/``"PS3"`` to a dict with the keys
        ``"GLstring"``, ``"Sequence"`` and ``"blockIDs"``. ``"PS3"`` holds
        the last sequence that matched neither typing. When only PS1 was
        found, ``"PS2"`` refers to the very same dict object (homozygous).
    """
    # ID only appears in messages. Float IDs keep their historical
    # integer rendering; anything else (int, numpy scalar, ...) is made
    # printable instead of breaking the string concatenation.
    if isinstance(ID, float):
        ID = str(int(ID))
    else:
        ID = str(ID)
    Locus = tplist[0].split("*")[0]
    # readIMGTsql is assumed to return an indexable (Exon2, Exon3) pair --
    # TODO confirm against utils.IMGTdbIO.
    ARS0seq = IMGTdbIO.readIMGTsql(tplist[0],
                                   field='Exon2, Exon3',
                                   version=version)
    ARS1seq = IMGTdbIO.readIMGTsql(tplist[1],
                                   field='Exon2, Exon3',
                                   version=version)

    # One flag per typing. The original tested the FIRST typing's serotype
    # in the PS2 branches as well, which mislabelled the blocks whenever the
    # DQB1*02 allele was the second typing.
    two_blocks = [tp.split(":")[0] == "DQB1*02" for tp in tplist]

    if seq_count > 3:
        print(
            "Please check the ID: " + ID + " Locus " + Locus +
            ", have heterozygotic DQB1*02 types or have more sequences than expected."
        )

    phases = ("PS1", "PS2")
    references = (ARS0seq, ARS1seq)
    QueryTyping = {}
    for seq_item in unique_Query:
        match = _dqb102_find_match(seq_item, references)
        if match is None:
            # Unmatched sequences overwrite each other; only the last one
            # is kept (unchanged from the original behaviour).
            QueryTyping["PS3"] = {
                "GLstring": unique_HLATyping_list,
                "Sequence": [seq_item],
                "blockIDs": [1]
            }
            print(ID + ": The sequence at Locus " + Locus +
                  " doesn't match to either of the Typings")
            continue

        typing_idx, exon_idx = match
        _dqb102_store(QueryTyping, phases[typing_idx],
                      unique_HLATyping_list[typing_idx], seq_item, exon_idx,
                      two_blocks[typing_idx])

    if "PS1" in QueryTyping and "PS2" not in QueryTyping:  ## Homozygous
        QueryTyping["PS2"] = QueryTyping["PS1"]

    return QueryTyping
def _two_block_find_match(seq_item, references):
    """Return ``(typing_index, exon_index)`` of the first reference exon found.

    Test order: first typing Exon2, first typing Exon3, second typing Exon2,
    second typing Exon3. Returns ``None`` when nothing matches.
    """
    for typing_idx, exons in enumerate(references):
        for exon_idx in (0, 1):
            if exons[exon_idx] in seq_item:
                return typing_idx, exon_idx
    return None


def _two_block_store(QueryTyping, phase, glstring, seq_item, exon_idx):
    """Record *seq_item* under *phase*; block 1 is Exon2, block 2 is Exon3."""
    entry = QueryTyping.get(phase)
    if entry is None:
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item],
            "blockIDs": [exon_idx + 1]
        }
    elif exon_idx == 0:
        # altered block order: the Exon3 block arrived first, so the Exon2
        # block is placed in front of it.
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item, entry["Sequence"][0]],
            "blockIDs": [1, 2]
        }
    else:
        entry["Sequence"].append(seq_item)
        entry["blockIDs"].append(2)


def check_twoBlock_seq(seq_count,
                       tplist,
                       unique_Query,
                       unique_HLATyping_list,
                       ID,
                       version="3310"):
    """Assign two-block query sequences to the phase sets of two typings.

    Each query sequence is matched by substring against the Exon2 (block 1)
    and Exon3 (block 2) reference sequences of ``tplist[0]`` (``"PS1"``) and
    ``tplist[1]`` (``"PS2"``).

    Args:
        seq_count: Number of sequences of the case; more than 4 only
            triggers a warning message.
        tplist: The two HLA typings.
        unique_Query: Iterable of query sequence strings.
        unique_HLATyping_list: GL strings; index 0 is stored with PS1,
            index 1 with PS2, the whole list with PS3.
        ID: Case identifier, used only in printed messages.
        version: IMGT database version passed to ``IMGTdbIO.readIMGTsql``.

    Returns:
        dict mapping ``"PS1"``/``"PS2"``/``"PS3"`` to a dict with the keys
        ``"GLstring"``, ``"Sequence"`` and ``"blockIDs"``. ``"PS3"`` holds
        the last sequence that matched neither typing. When only PS1 was
        found, ``"PS2"`` refers to the very same dict object (homozygous).
    """
    # ID only appears in messages. Float IDs keep their historical
    # integer rendering; anything else (int, numpy scalar, ...) is made
    # printable instead of breaking the string concatenation.
    if isinstance(ID, float):
        ID = str(int(ID))
    else:
        ID = str(ID)
    Locus = tplist[0].split("*")[0]
    # readIMGTsql is assumed to return an indexable (Exon2, Exon3) pair --
    # TODO confirm against utils.IMGTdbIO.
    ARS0seq = IMGTdbIO.readIMGTsql(tplist[0],
                                   field='Exon2, Exon3',
                                   version=version)
    ARS1seq = IMGTdbIO.readIMGTsql(tplist[1],
                                   field='Exon2, Exon3',
                                   version=version)

    if seq_count > 4:
        print("Please check the ID: " + ID + " Locus " + Locus +
              "! More sequences than expected.")

    phases = ("PS1", "PS2")
    references = (ARS0seq, ARS1seq)
    QueryTyping = {}
    for seq_item in unique_Query:
        match = _two_block_find_match(seq_item, references)
        if match is None:
            # Unmatched sequences overwrite each other; only the last one
            # is kept (unchanged from the original behaviour).
            QueryTyping["PS3"] = {
                "GLstring": unique_HLATyping_list,
                "Sequence": [seq_item],
                "blockIDs": [1]
            }
            print(ID + ": The sequence at Locus " + Locus +
                  " doesn't match to either of the Typings")
            continue

        typing_idx, exon_idx = match
        _two_block_store(QueryTyping, phases[typing_idx],
                         unique_HLATyping_list[typing_idx], seq_item,
                         exon_idx)

    if "PS1" in QueryTyping and "PS2" not in QueryTyping:  ## Homozygous
        QueryTyping["PS2"] = QueryTyping["PS1"]

    return QueryTyping
def _one_block_reference_exons(tplist, ars0, ars1, version):
    """Pick the first exon set on which the two typings differ.

    Tried in order: Exon2+Exon3 (already loaded by the caller), then
    Exon1/4/5/6, then Exon7. Returns ``(ref0, ref1, n_exons)`` for the first
    set whose references differ, or ``None`` when all of them are equal.
    """
    if ars0 != ars1:
        return ars0, ars1, 2

    ref0 = IMGTdbIO.readIMGTsql(
        tplist[0], field='Exon1, Exon4, Exon5, Exon6', version=version)
    ref1 = IMGTdbIO.readIMGTsql(
        tplist[1], field='Exon1, Exon4, Exon5, Exon6', version=version)
    if ref0 != ref1:
        return ref0, ref1, 4

    ref0 = IMGTdbIO.readIMGTsql(tplist[0], field='Exon7', version=version)
    ref1 = IMGTdbIO.readIMGTsql(tplist[1], field='Exon7', version=version)
    if ref0 != ref1:
        return ref0, ref1, 1

    return None


def _one_block_add(QueryTyping, phase, glstring, seq_item):
    """Create *phase* with block ID 1, or append *seq_item* with block ID 2."""
    entry = QueryTyping.get(phase)
    if entry is None:
        QueryTyping[phase] = {
            "GLstring": glstring,
            "Sequence": [seq_item],
            "blockIDs": [1]
        }
    else:
        entry['Sequence'].append(seq_item)
        entry['blockIDs'].append(2)


def check_oneBlock_seq(seq_count,
                       tplist,
                       unique_Query,
                       unique_HLATyping_list,
                       ID,
                       version="3310"):
    """Assign one-block query sequences to the phase sets of two typings.

    A query sequence belongs to a typing when every reference exon of the
    discriminating exon set is a substring of it. The discriminating set is
    the first of Exon2+Exon3, Exon1/4/5/6, Exon7 on which the two typings
    differ. If they never differ, sequences are distributed in arrival
    order: the first to PS1, the second to PS2, any further ones to PS1.

    Args:
        seq_count: Number of sequences of the case; more than 2 only
            triggers a warning message.
        tplist: The two HLA typings; ``tplist[0]`` maps to ``"PS1"`` and
            ``tplist[1]`` to ``"PS2"``.
        unique_Query: Iterable of query sequence strings.
        unique_HLATyping_list: GL strings; index 0 is stored with PS1,
            index 1 with PS2, the whole list with PS3.
        ID: Case identifier, used only in printed messages.
        version: IMGT database version passed to ``IMGTdbIO.readIMGTsql``.

    Returns:
        dict mapping ``"PS1"``/``"PS2"``/``"PS3"`` to a dict with the keys
        ``"GLstring"``, ``"Sequence"`` and ``"blockIDs"``. ``"PS3"`` collects
        every sequence that matched neither typing. When only PS1 was found,
        ``"PS2"`` refers to the very same dict object (homozygous).
    """
    # ID only appears in messages. Float IDs keep their historical
    # integer rendering; anything else (int, numpy scalar, ...) is made
    # printable instead of breaking the string concatenation.
    if isinstance(ID, float):
        ID = str(int(ID))
    else:
        ID = str(ID)
    Locus = tplist[0].split("*")[0]

    # readIMGTsql is assumed to return an indexable sequence with one entry
    # per requested field -- TODO confirm against utils.IMGTdbIO.
    ARS0seq = IMGTdbIO.readIMGTsql(tplist[0],
                                   field='Exon2, Exon3',
                                   version=version)
    ARS1seq = IMGTdbIO.readIMGTsql(tplist[1],
                                   field='Exon2, Exon3',
                                   version=version)

    if seq_count > 2:
        print("Please check the ID: " + ID + " Locus " + Locus +
              "! More sequences than expected.")

    # The choice of discriminating exons does not depend on the query
    # sequence, so the extra database lookups are done once here instead of
    # once per query sequence.
    references = _one_block_reference_exons(tplist, ARS0seq, ARS1seq,
                                            version)

    QueryTyping = {}
    for seq_item in unique_Query:
        if references is None:
            # Exon1-Exon7 are identical for both typings: fall back to
            # arrival order.
            if "PS1" not in QueryTyping:
                _one_block_add(QueryTyping, "PS1", unique_HLATyping_list[0],
                               seq_item)
            elif "PS2" not in QueryTyping:
                _one_block_add(QueryTyping, "PS2", unique_HLATyping_list[1],
                               seq_item)
            else:
                _one_block_add(QueryTyping, "PS1", unique_HLATyping_list[0],
                               seq_item)
            print(
                ID + ": The sequence at Locus " + Locus +
                " two typings have exactly the same Exon sequences. Cannot distinguish by Exons."
            )
            continue

        ref0, ref1, n_exons = references
        if all(ref0[i] in seq_item for i in range(n_exons)):  # first type
            _one_block_add(QueryTyping, "PS1", unique_HLATyping_list[0],
                           seq_item)
        elif all(ref1[i] in seq_item for i in range(n_exons)):  # second type
            _one_block_add(QueryTyping, "PS2", unique_HLATyping_list[1],
                           seq_item)
        else:
            # Unmatched sequences are accumulated for every exon set. The
            # original Exon7 fallback overwrote PS3 and silently dropped
            # earlier unmatched sequences, unlike its two sibling branches.
            _one_block_add(QueryTyping, "PS3", unique_HLATyping_list,
                           seq_item)
            print(ID + ": The sequence at Locus " + Locus +
                  " doesn't match to either of the Typings")

    if "PS1" in QueryTyping and "PS2" not in QueryTyping:  ## Homozygous
        QueryTyping["PS2"] = QueryTyping["PS1"]

    return QueryTyping
Ejemplo n.º 9
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Feb  3 12:28:22 2018

@author: hhuang2
"""
import sqlite3 as sql
from utils import IMGTdbIO

# Build the DQB1 SQLite database for this IMGT release under ../Database/.
version = '3310'
IMGTdbIO.buildIMGTsql('DQB1', version=version, output_fp="../Database/")

locus = 'DQB1'
db_fp = '../Database/'

# Path of the database file opened below, assembled from release and locus.
filename = db_fp + "IMGT-" + version + "_HLA-" + locus + ".db"

con = sql.connect(filename)
cur = con.cursor()
# Fetch every typing name; fetchall() returns one 1-tuple per row.
field1 = 'HLATyping'
cur.execute('SELECT ' + field1 + ' FROM Sequences')
Typings_temp = cur.fetchall()
# Count the typings whose AlignedGenomSeq is not the empty string.
# NOTE(review): this issues one query per typing; a single aggregate query
# would do the same work -- left untouched because later code may rely on
# the intermediate variables. The connection is not closed in this excerpt.
count = 0
field2 = 'AlignedGenomSeq'
for tp in Typings_temp:
    # tp is the 1-tuple row from fetchall(), so it doubles as the parameter
    # sequence for the single "?" placeholder.
    cur.execute('SELECT ' + field2 + ' FROM Sequences WHERE HLATyping = ?', tp)
    sequences_temp = cur.fetchone()
    if sequences_temp[0] != '':
        count += 1
Ejemplo n.º 10
0
@author: hhuang2
"""

from utils import IMGTdbIO  #, CompareSeq
from collections import Counter

# Which group of paired cases to analyse; alternatives are in the trailing
# comment.
groupType = 'All_paired'  # groupType = 'ClassI_paired' # groupType = 'All_paired'

# Locus groupings used by the analysis.
All_loci = ['A', 'B', 'C', 'DRB1', 'DQB1', 'DPB1']
Five_loci = ['A', 'B', 'C', 'DRB1', 'DQB1']
ClassI_loci = ['A', 'B', 'C']
ClassII_loci = ['DRB1', 'DQB1', 'DPB1']

# NOTE(review): the file-name prefix says 'ClassI_Stats' regardless of
# groupType -- confirm this is the intended file for 'All_paired'.
Group_fname = '../Output/Stats/ClassI_Stats_1003_' + groupType + '.pkl'
Stats_Dict = IMGTdbIO.load_pickle2dict(Group_fname)

# The stats pickle carries two top-level entries.
CaseStats = Stats_Dict['CaseStats']
LocusStats = Stats_Dict['LocusStats']

## TODO1: paired cases HLA typing stats
fname = '../Output/SG39_DRpairs/SG39_pairedCases_Stats.pkl'
Matching_cases_stats = IMGTdbIO.load_pickle2dict(fname)

# Filled per locus by the loop that follows.
AlleleStats = {}
for locus in All_loci:
    AlleleStats[locus] = {}
    DRpaired_file = '../Output/SG39_DRpairs/SG39_HLA_' + locus + '_paired.pkl'

    DRpaired_table = IMGTdbIO.load_pickle2dict(DRpaired_file)
    num_total = len(DRpaired_table)