Beispiel #1
0
def MongotoPTMannotation(proteinIDs, Tag_FTs, output_types, output_prefix):
    """Write one annotation file per tag for the given protein ids.

    For every id in ``proteinIDs`` the matching document is fetched from the
    ``test.table`` collection.  Each entry of ``Tag_FTs`` is a tag that may
    consist of several space-separated feature names; the positions stored
    in the document under those names are collected, and if there is at
    least one, a record is appended to ``<output_prefix>/<tag>.fasta``.

    Args:
        proteinIDs: iterable of ``_id`` values to look up.
        Tag_FTs: sequence of tags (also used as output file names).
        output_types: ``1`` formats records with ``duolin(id, positions,
            sequence)``; any other value uses ``chunhui(id, sequence)``.
        output_prefix: output directory, created if it does not exist.
    """
    table = functions.connectMongoDB('test', 'table')

    if not os.path.exists(output_prefix):
        os.makedirs(output_prefix)

    files = []
    try:
        for tag in Tag_FTs:
            files.append(open(output_prefix + '/' + tag + '.fasta', 'w'))

        for protein_id in proteinIDs:
            ptm = table.find_one({'_id': protein_id})
            print(ptm)
            if ptm is None:
                # No document for this id: nothing to annotate.
                continue

            for out_file, tag in zip(files, Tag_FTs):
                # Positions are collected per tag so that a tag's file only
                # ever receives the positions of its own features (they were
                # previously carried over from the preceding tags).
                ft_index = []
                for feature in tag.split(" "):
                    if feature in ptm:
                        ft_index.extend(ptm[feature])

                if not ft_index:
                    continue

                sequence = ptm['sequence']
                if output_types == 1:  # DUOLIN
                    out_data = duolin(ptm['_id'], ft_index, sequence)
                else:  # CHUNHUI
                    out_data = chunhui(ptm['_id'], sequence)
                out_file.write(out_data)
    finally:
        # Close whatever was opened, even if a lookup or formatter failed.
        for out_file in files:
            out_file.close()
Beispiel #2
0
def main():
    """Count stored PTM positions per PTM type over the whole collection.

    Every document of ``uniprot.table`` is scanned; for each tracked PTM name
    present in a document, the length of its position list is added to that
    PTM's total.  The totals are printed and dumped as JSON to
    ``display.txt``.
    """
    ptm_names = (
        'Phosphoserine',
        'Phosphothreonine',
        'Phosphotyrosine',
        'N6-acetyllysine',
        'Omega-N-methylarginine',
        'N6-methyllysine',
        'N6,N6-dimethyllysine',
        'N6,N6,N6-trimethyllysine',
        'N-linked(GlcNAc)asparagine',
        'S-palmitoylcysteine',
        'Pyrrolidonecarboxylicacid',
        'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)',
    )
    display = dict.fromkeys(ptm_names, 0)

    collection = functions.connectMongoDB('uniprot', 'table')
    for data in collection.find({}):
        for ptm in ptm_names:
            if ptm in data:
                display[ptm] += len(data[ptm])

    print(display)
    with open('display.txt', 'w') as outfile:
        json.dump(display, outfile)
Beispiel #3
0
def ptmPosition(Tag_FTs):
    """Write relative PTM positions for every hit listed in ``format8.txt``.

    Each non-blank line of ``format8.txt`` is split on whitespace.  Field 1
    is used as the ``_id`` of a document in ``uniprot.table``; fields 6, 8
    and 9 are converted to integers and passed, together with the positions
    stored under a tag, to ``calc_psition``.  The result is formatted with
    ``prepare`` and appended to ``data/<tag>.txt``, one file per tag.

    Args:
        Tag_FTs: sequence of feature names; each one is both a document key
            and the stem of an output file name.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    if not os.path.exists("data"):
        os.makedirs("data")

    files = []
    try:
        for tag in Tag_FTs:
            files.append(open('data/' + tag + '.txt', 'w'))

        with open("format8.txt") as fp:
            for line in fp:
                parse = line.split()
                if not parse:
                    # Blank line (e.g. trailing newline at end of file).
                    continue

                ptm = table.find_one({'_id': parse[1]})
                if ptm is None:
                    # Hit refers to an id that is not in the table.
                    continue

                # Presumably query start / subject start / subject end of a
                # BLAST tabular line -- TODO confirm against calc_psition.
                first = int(parse[6])
                second = int(parse[8])
                third = int(parse[9])

                for out_file, ft in zip(files, Tag_FTs):
                    # Only this tag's positions go to this tag's file (they
                    # were previously accumulated across the preceding tags).
                    ptm_pos = list(ptm[ft]) if ft in ptm else []
                    relative_positions = calc_psition(first, second, third,
                                                      ptm_pos)
                    out_file.write(prepare(ptm['_id'], relative_positions))
    finally:
        # Close every opened output file, even when processing failed.
        for out_file in files:
            out_file.close()
Beispiel #4
0
def get_ids(sp):
    """Return the ``_id`` of every document whose ``species`` contains ``sp``.

    Documents of ``uniprot.table`` that have no ``species`` field, or an
    empty one, are skipped instead of raising ``KeyError``.

    Args:
        sp: value tested with ``in`` against each document's ``species``.

    Returns:
        List of matching ``_id`` values, in cursor order.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    ids = []
    for doc in table.find():
        species = doc.get('species')
        if species and sp in species:
            ids.append(doc['_id'])
    return ids
Beispiel #5
0
def ubiquitin(filepath, fts):
    """Parse a flat file and save ubiquitin feature positions per record.

    The file is read line by line.  Whitespace is collapsed, the line is
    split on ``;`` and the part before the first ``;`` is split into words;
    the first word is the line tag:

    * ``AC`` (first one per record): collects the accession numbers.
    * ``FT``: words 2 and 3 are the position (de-duplicated through
      ``functions.remove_duplicates``), words 4.. are concatenated without
      spaces and stripped of dots to form the feature name.  When that name
      is one of the prefixes in ``specials``, the words of the next ``FT``
      line are appended to it before it is looked up.
    * ``//``: ends the record; non-empty feature lists are flattened, merged
      with the accession data and saved to ``uniprot.ubiquitinTable``.

    Args:
        filepath: path of the flat file to parse.
        fts: mapping of feature name -> list, used as the collector for the
            first record.  It is copied and never modified.  After every
            record the collector is reset to the built-in ubiquitin features.
    """
    table = functions.connectMongoDB('uniprot', 'ubiquitinTable')

    # Work on a private copy so the caller's dictionary is not polluted with
    # the positions of the first record.
    fts = dict((name, list(positions)) for name, positions in fts.items())

    ac_seen = False
    out_ac = []
    out_position = []
    out_data = dict()
    awaiting_continuation = False
    ft = ''
    specials = ['Glycyllysineisopeptide(Lys-Gly)','Peptide(Met-Gly)(interchainwithG-Cter','Glycylserineester(Ser-Gly)','Glycylcysteinethioester(Cys-Gly)']

    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            tag = parsed_1[0]

            if tag == "AC" and not ac_seen:
                ac_seen = True
                out_ac.append(parsed_1[1])
                # Remaining ';'-separated items except the last one.
                out_ac.extend(data[1:-1])
                out_data = {'ac': out_ac}
            elif tag == "FT":
                if len(parsed_1) > 4 and not awaiting_continuation:
                    ft = re.sub('[.]', '', ''.join(parsed_1[4:]))
                    out_position = functions.remove_duplicates(
                        [parsed_1[2], parsed_1[3]])
                    if ft in specials:
                        # The name continues on the next FT line.
                        awaiting_continuation = True
                        continue
                    if ft in fts:
                        fts[ft].append(out_position)
                        out_position = []
                elif awaiting_continuation:
                    ft = re.sub('[.]', '', ft + ''.join(parsed_1[1:]))
                    if ft in fts:
                        fts[ft].append(out_position)
                        out_position = []
                    awaiting_continuation = False
            elif tag == '//':
                # Drop empty features and flatten [[a, b], [c]] -> [a, b, c].
                fts = dict(
                    (name, list(itertools.chain.from_iterable(positions)))
                    for name, positions in fts.items() if len(positions) > 0)
                out_data = functions.merge_two_dicts(out_data, fts)
                # NOTE(review): Collection.save() was removed in PyMongo 4;
                # confirm the driver version before upgrading.
                table.save(out_data)

                # Reset the per-record state.
                fts = {'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)':[],'Glycylserineester(Ser-Gly)(interchainwithG-Cterinubiquitin)':[],
                       'Peptide(Met-Gly)(interchainwithG-Cterinubiquitin)':[],'Glycylcysteinethioester(Cys-Gly)(interchainwithG-Cterinubiquitin)':[]}
                out_ac = []
                ac_seen = False
                out_position = []
Beispiel #6
0
def db_to_fasta(output_prefix):
    """Dump every document of ``uniprot.table`` to ``<output_prefix>.fasta``.

    Each document's ``_id`` and ``sequence`` are formatted with
    ``prepareData`` and written in cursor order.  The output file is closed
    even if a document is malformed or the formatter raises.

    Args:
        output_prefix: output path without the ``.fasta`` extension.
    """
    entry = functions.connectMongoDB('uniprot', 'table')
    with open(output_prefix + '.fasta', 'w') as out_file:
        for doc in entry.find({}):
            out_file.write(prepareData(doc['_id'], doc['sequence']))
Beispiel #7
0
def main():
    """Export the stored fields for each accession to ``positionInfo.txt``.

    For every accession in ``acs`` the matching document of
    ``uniprot.ubiquitinTable`` is looked up by its ``ac`` field.  A record
    holding the accession under ``AC`` plus all document fields except
    ``_id`` and ``ac`` is collected, and the list of records is written as
    JSON once all accessions have been processed.
    """
    collection = functions.connectMongoDB('uniprot', 'ubiquitinTable')
    skipped_keys = ('_id', 'ac')
    records = []

    with open('positionInfo.txt', 'w') as outfile:
        for ac in acs:
            result = collection.find_one({'ac': ac})
            print(ac)
            record = {'AC': ac}
            record.update(
                (key, result[key]) for key in result
                if key not in skipped_keys)
            records.append(record)
        json.dump(records, outfile)
Beispiel #8
0
def blast_output(output_prefix):
    """Write query and subject sequences of ``format8.txt`` hits to a file.

    Each non-blank line of ``format8.txt`` is split on whitespace.  Whenever
    field 0 changes, a line with that id (padded to 14 characters) and the
    sequence returned by ``get_query_seq`` is written.  For every line, the
    document whose ``_id`` is field 1 is fetched from ``uniprot.table``; its
    sequence is sliced from ``int(field 8) - 1`` to ``int(field 9)``, passed
    through ``fillup`` with ``int(field 6) - 1`` and written after the padded
    id.  Lines whose document is missing are skipped.

    Args:
        output_prefix: output directory (created if needed); the result is
            written to ``<output_prefix>/input1.txt``.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    if not os.path.exists(output_prefix):
        os.makedirs(output_prefix)

    current = " "
    # Both files are closed on exit, including on errors.
    with open(output_prefix + '/input1.txt', 'w') as out_file, \
            open("format8.txt") as fp:
        for line in fp:
            parse = line.split()
            if not parse:
                # Blank line (e.g. trailing newline at end of file).
                continue

            if current != parse[0]:
                current = parse[0]
                seq = get_query_seq(current)
                out_file.write('{:14}'.format(current) + seq + "\n")

            p_doc = table.find_one({'_id': parse[1]})
            if p_doc is None:
                # Field 1 refers to an id that is not in the table.
                continue

            start = int(parse[8]) - 1
            end = int(parse[9])
            p_seq = fillup(int(parse[6]) - 1, p_doc["sequence"][start:end])
            out_file.write('{:14}'.format(parse[1]) + p_seq + "\n")
def tableGeneration(filepath, fts):
    """Parse a flat file and save one document per record to ``uniprot.table``.

    The file is read line by line.  Whitespace is collapsed, the line is
    split on ``;`` and the part before the first ``;`` is split into words;
    the first word is the line tag:

    * ``ID`` / ``AC`` (first of each per record): record id and accessions.
    * ``FT``: words 2 and 3 are the position (de-duplicated through
      ``functions.remove_duplicates``), words 4.. are concatenated without
      spaces and stripped of dots to form the feature name.  For
      ``Glycyllysineisopeptide(Lys-Gly)`` the words of the next ``FT`` line
      are appended to the name before it is looked up.
    * ``SQ``: every following line up to ``//`` is sequence data.  Outside
      such a section, lines whose first word is longer than two characters
      are also treated as sequence data.
    * ``//``: ends the record; non-empty feature lists are flattened, merged
      with id/accessions, the whitespace-free sequence is added and the
      document is saved.

    Args:
        filepath: path of the flat file to parse.
        fts: mapping of feature name -> list, used as the collector for the
            first record.  It is copied and never modified.  After every
            record the collector is reset to the built-in PTM names.
    """
    table = functions.connectMongoDB('uniprot', 'table')

    # Work on a private copy so the caller's dictionary is not polluted with
    # the positions of the first record.
    fts = dict((name, list(positions)) for name, positions in fts.items())

    id_seen = False
    ac_seen = False
    in_sequence = False
    awaiting_continuation = False
    out_id = ''
    out_ac = []
    out_position = []
    out_data = dict()
    ft = ''
    sequence_parts = []

    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            tag = parsed_1[0]

            if in_sequence and tag != '//':
                # Inside the SQ section every line is sequence data, including
                # a final line holding only one or two residues, which the
                # word-length test below would miss.
                sequence_parts.append(collapsed)
            elif tag == "ID" and not id_seen:
                id_seen = True
                out_id = parsed_1[1]
            elif tag == "AC" and not ac_seen:
                ac_seen = True
                out_ac.append(parsed_1[1])
                # Remaining ';'-separated items except the last one.
                out_ac.extend(data[1:-1])
                out_data = {'_id': out_id, 'ac': out_ac}
            elif tag == "FT":
                if len(parsed_1) > 4 and not awaiting_continuation:
                    ft = re.sub('[.]', '', ''.join(parsed_1[4:]))
                    out_position = functions.remove_duplicates(
                        [parsed_1[2], parsed_1[3]])
                    if ft == 'Glycyllysineisopeptide(Lys-Gly)':
                        # The name continues on the next FT line.
                        awaiting_continuation = True
                        continue
                    if ft in fts:
                        fts[ft].append(out_position)
                        out_position = []
                elif awaiting_continuation:
                    ft = re.sub('[.]', '', ft + ''.join(parsed_1[1:]))
                    if ft in fts:
                        fts[ft].append(out_position)
                        out_position = []
                    awaiting_continuation = False
            elif tag == "SQ":
                in_sequence = True
            elif len(tag) > 2:
                # Fallback kept for files without an SQ header: line tags are
                # two characters long, sequence chunks are longer.
                sequence_parts.append(collapsed)
            elif tag == '//':
                # Drop empty features and flatten [[a, b], [c]] -> [a, b, c].
                fts = dict(
                    (name, list(itertools.chain.from_iterable(positions)))
                    for name, positions in fts.items() if len(positions) > 0)
                out_data = functions.merge_two_dicts(out_data, fts)
                out_data['sequence'] = ''.join(''.join(sequence_parts).split())
                # NOTE(review): Collection.save() was removed in PyMongo 4;
                # confirm the driver version before upgrading.
                table.save(out_data)

                # Reset the per-record state.
                fts = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked(GlcNAc)asparagine': [],
                    'S-palmitoylcysteine': [],
                    'Pyrrolidonecarboxylicacid': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)':
                    [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)':
                    []
                }
                out_ac = []
                id_seen = False
                ac_seen = False
                in_sequence = False
                out_position = []
                sequence_parts = []
Beispiel #10
0
def _record_ptm(ptms, description, position):
    """Append ``position`` to the ``ptms`` entry named by ``description``.

    Dots directly before a ``)`` are removed from the description, then
    everything from the first ``.``, ``|`` or ``;`` onwards is cut off.  If
    the remaining text is a key of ``ptms`` the position is appended to that
    key's list; otherwise nothing happens.
    """
    description = re.sub(r'(\.*)\)', ')', description)
    key = re.sub(r'[\.|\;].*', '', description)
    if key in ptms:
        ptms[key].append(position)


def tableGeneration(filepath, ptms):
    """Rebuild ``uniprot.table`` from a flat file.

    The collection is dropped first.  The file is then read with
    ``readline``; whitespace is collapsed, the line is split on ``;`` and the
    part before the first ``;`` is split into words, the first being the tag:

    * ``ID``: record id.
    * ``AC``: accessions (every ``AC`` line of the record is collected).
    * ``OC``: collected into ``species``; also (re)creates the record's base
      document from the id, accessions and species gathered so far.
    * ``FT``: this line and all directly following ``FT`` lines are consumed.
      A line whose words 2 and 3 satisfy ``is_number`` starts a new feature
      (position = words 2 and 3, description = words 4..); any other ``FT``
      line is appended to the current description.  Each finished feature is
      stored through ``_record_ptm``.  Afterwards empty features are dropped,
      the position lists are flattened, and the file is rewound to the first
      non-``FT`` line.
    * ``SQ``: the sequence is read with ``seq_read(fp)``, the document is
      completed and saved, and the per-record state is reset.

    Args:
        filepath: path of the flat file to parse.
        ptms: mapping of PTM name -> list, used as the collector for the
            first record.  It is copied and never modified.  After every
            record the collector is reset to the built-in PTM names.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    table.drop()

    # Work on a private copy so the caller's dictionary is not polluted with
    # the positions of the first record.
    ptms = dict((name, list(positions)) for name, positions in ptms.items())

    out_id = ""
    out_ac = []
    out_position = []
    out_data = dict()
    check = []

    # ``with`` guarantees the file is closed even if parsing fails.
    with open(filepath) as fp:
        line = fp.readline()

        while line:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            info = data[0].split(" ")
            tag = info[0]

            if tag == "ID":
                out_id = info[1]
            elif tag == "AC":
                out_ac.append(info[1])
                # Remaining ';'-separated items except the last one.
                out_ac.extend(item.lstrip() for item in data[1:-1])
            elif tag == "OC":
                check.append(info[1].lstrip())
                check.extend(item.lstrip() for item in data[1:-1])
                out_data = {"_id": out_id, "ac": out_ac, "species": check}
            elif tag == "FT":
                out_position = functions.remove_duplicates([info[2], info[3]])
                temp_ptm = " ".join(info[4:])

                # Remember where the next line starts so the first non-FT
                # line can be re-read by the outer loop.
                prev_fp_pos = fp.tell()
                info = ' '.join(fp.readline().split()).split(" ")
                while info[0] == "FT":
                    if (len(info) > 3 and is_number(info[2])
                            and is_number(info[3])):
                        # A new feature starts: store the finished one.
                        _record_ptm(ptms, temp_ptm, out_position)
                        out_position = functions.remove_duplicates(
                            [info[2], info[3]])
                        temp_ptm = " ".join(info[4:])
                    else:
                        # Continuation line: appended without a separator.
                        temp_ptm = temp_ptm + " ".join(info[1:])
                    prev_fp_pos = fp.tell()
                    info = ' '.join(fp.readline().split()).split(" ")

                # Store the last feature of the FT run.
                _record_ptm(ptms, temp_ptm, out_position)

                # Drop empty PTMs and flatten [[a, b], [c]] -> [a, b, c].
                ptms = dict(
                    (name, list(itertools.chain.from_iterable(positions)))
                    for name, positions in ptms.items() if len(positions) > 0)
                fp.seek(prev_fp_pos)
            elif tag == "SQ":
                sequence = seq_read(fp)
                out_data = functions.merge_two_dicts(out_data, ptms)
                out_data['sequence'] = sequence
                # NOTE(review): Collection.save() was removed in PyMongo 4;
                # confirm the driver version before upgrading.
                table.save(out_data)

                # Reset the per-record state.
                ptms = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'Dimethylated arginine': [],
                    'Symmetric dimethylarginine': [],
                    'Asymmetric dimethylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked (GlcNAc) asparagine': [],
                    'S-palmitoyl cysteine': [],
                    'Pyrrolidone carboxylic acid': [],
                    'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in SUMO)':
                    [],
                    'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in ubiquitin)':
                    []
                }
                out_data.clear()
                out_ac = []
                out_position = []
                check = []

            line = fp.readline()
Beispiel #11
0
def blast_output(filepath, ptms, out_folder):
    """Generate the alignment display and per-PTM files from a BLAST report.

    The report at ``filepath`` is read with ``readline``.  Processing starts
    at a line whose first word is ``Query_1`` and continues until a line
    starting with ``Lambda`` (or the end of the file):

    * further ``Query_1`` lines extend the query sequence when their start
      index directly follows the previous end index (otherwise
      ``special case!`` is printed);
    * four-word lines whose words 1 and 3 satisfy ``is_int`` are subject
      rows; the text in the query's sequence columns is collected per
      subject id together with its first start and latest end index;
    * a line whose first word is a backslash hands the file to
      ``get_deletions`` and stores the result for the preceding subject.

    Afterwards every collected subject row is padded to the query length,
    spaces become ``.``, insertions are computed with ``get_inserts``, and
    the results are written through ``get_ptms`` / ``display_ptm`` (one file
    per PTM) and ``display_output`` (``blast_output.txt``).

    Args:
        filepath: path of the BLAST report.
        ptms: iterable of PTM names; ``<out_folder>/<ptm>.txt`` is written
            for each.
        out_folder: existing directory receiving all output files.
    """
    files = []
    out_file = None
    fp = None
    try:
        for ptm in ptms:
            files.append(open(out_folder + '/' + ptm + '.txt', 'w'))

        table = functions.connectMongoDB('uniprot', 'table')
        out_file = open(out_folder + '/blast_output.txt', 'w')

        # NOTE(review): these three dictionaries are never reset, so they
        # keep growing if the report holds several result sections --
        # confirm that this is intended.
        output = dict()
        ac_deletions = dict()
        insertions = dict()

        fp = open(filepath)
        line = fp.readline()

        sequence_pad = -1

        while line:
            data = ' '.join(line.split()).split(" ")
            # 1. read blast result sequences
            if data[0] == 'Query_1':  # blast result start
                sequence_pad += 1
                temp_q_end = int(data[3])
                q_seq = data[2]
                # Column range of the sequence text within the raw line.
                seqs_start_position = line.find(data[2])
                seqs_end_position = line.find(data[3]) - 2
                seqs_end_index = dict()  # sequence end index
                seqs_start_index = dict()  # sequence start index

                line = fp.readline()
                data = ' '.join(line.split()).split(" ")

                prev_ac = ""

                while line and data[0] != "Lambda":
                    if data[0] == 'Query_1':  # next chunk of the query
                        sequence_pad += 1
                        seqs_start_position = line.find(data[2])
                        seqs_end_position = line.find(data[3]) - 2
                        if temp_q_end == int(data[1]) - 1:
                            temp_q_end = int(data[3])
                            q_seq += data[2]
                        else:
                            print("special case!")
                    elif (len(data) == 4 and is_int(data[1])
                          and is_int(data[3])):  # subject row
                        subject = data[0]
                        prev_ac = subject
                        seqs_end_index[subject] = int(data[3])
                        segment = line[seqs_start_position:seqs_end_position]
                        if subject in output:  # continuation row
                            output[subject] += segment
                        else:  # first row of this subject
                            seqs_start_index[subject] = int(data[1])
                            output[subject] = reposition_seq(segment,
                                                             sequence_pad)
                    elif data[0] == '\\':
                        delete = get_deletions(fp, sequence_pad)
                        if prev_ac in ac_deletions:
                            ac_deletions[prev_ac] += delete
                        else:
                            ac_deletions[prev_ac] = delete

                    line = fp.readline()
                    data = ' '.join(line.split()).split(" ")

                # 2. preprocess data
                # A statement that built a filtered copy of ``insertions``
                # via ``iteritems()`` and discarded it used to follow the
                # ``get_inserts`` call; it had no effect and raised
                # AttributeError on Python 3, so it has been removed.
                for subject in output:
                    padded = output[subject].ljust(len(q_seq))
                    output[subject] = padded.replace(" ", ".")
                    insertions[subject] = get_inserts(output[subject])

                # 3. write results
                # ptm position is relative to the line in the file, not the
                # sequence
                for ptm_file, ptm in zip(files, ptms):
                    ab_ptms = get_ptms(ptm, table, seqs_start_index,
                                       seqs_end_index, insertions,
                                       ac_deletions, output)  # TODO check if ids are right
                    display_ptm(ab_ptms, ptm_file, output)  # TODO one more ids
                identities = get_identities(out_folder)
                display_output(q_seq, output, identities, out_file)  # TODO ids here

            line = fp.readline()
    finally:
        # Close everything that was opened, even if parsing failed.
        if out_file is not None:
            out_file.close()
        if fp is not None:
            fp.close()
        for ptm_file in files:
            ptm_file.close()