def blastp2geneMap(blastFiles, organism1, taxID1, organism2, taxID2,
                   gene2refseq, topNum, IDtype='protein'):
    """
    This function transfers blastp results into geneID mapping results

    * blastFiles: a list of 2 way blastp tabular result files.
      eg: [blast1.txt,blast2.txt]. The 1st file is processed with switch
      'False', the 2nd with switch 'True'; extra files are ignored.
    * organism1: string. the 1st organism. eg: 'cho'
    * taxID1: taxonomy id of the 1st organism (passed to extract_from_gene2ref).
    * organism2: string. the 2nd organism. eg: 'human'
    * taxID2: taxonomy id of the 2nd organism (passed to extract_from_gene2ref).
    * gene2refseq: filename. eg: 'gene2refseq'
    * topNum: passed unchanged to extract_blast_ID_map
      (presumably the number of top hits kept per query -- confirm there).
    * IDtype: 'protein' selects gene2refseq columns [2,6,7,16];
      any other value selects [2,4,5,16].

    returns a list of four files. For first 2: one per blast file, as returned
    by uniq1stGene (two columns of gene ID mapping). For the rest 2:
    gene2refseq files for each organism.

    raises subprocess.CalledProcessError if an awk/sort/uniq step exits with a
    non-zero status, and OSError if the intermediate file cannot be removed.
    """
    # Local imports: only part of the module may be visible, so do not rely
    # on these names being imported at file level.
    import os
    try:
        from shlex import quote   # Python 3
    except ImportError:
        from pipes import quote   # Python 2

    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2, 6, 7, 16]
    else:
        columnNum = [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    # Keeps only the first two tab separated columns of the gene id map.
    awk_template = "awk -F $'\\t' 'BEGIN {OFS} {printrow}' {input} > {output}"
    sort_template = 'sort {keys} {input} | uniq > {output}'

    result = []
    # The direction flag is handed to the helpers as a string, not a bool.
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        # extract protein id map
        pr_id_map = extract_blast_ID_map(blast, topNum, switch)  # cho2human.top1.txt
        # protein id mapping to gene id mapping
        # NOTE(review): another caller in this file passes `switch` as the 4th
        # positional argument of mRNA_prIDMap2geneIDMap -- confirm which call
        # matches the helper's signature.
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map, org1ref, org2ref, IDtype)  # cho2human.top1.gene.txt
        # output names are derived by replacing the trailing 'txt'
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'  # cho2human.top1.gene.uniqline.txt
        interFile = gene_id_map[:-3] + 'inter.txt'

        # get unique id mappings (each line is unique, but genes are not unique)
        # quote() leaves ordinary paths untouched and protects paths that
        # contain spaces or shell metacharacters.
        cmd1 = awk_template.format(
            OFS='{FS="\\t"; OFS=FS}', printrow='{print $1,$2}',
            input=quote(gene_id_map), output=quote(interFile))
        # Only the sort key order depends on the direction.
        if switch == 'True':
            keys = '-k2,2 -k1,1'
        else:
            keys = '-k1,1 -k2,2'
        cmd2 = sort_template.format(
            keys=keys, input=quote(interFile), output=quote(uniqFile))

        # check_call: stop on a failed step instead of silently passing an
        # empty or partial file downstream. For the pipeline the shell reports
        # the status of its last stage (uniq).
        subprocess.check_call(cmd1, shell=True)
        subprocess.check_call(cmd2, shell=True)
        os.remove(interFile)

        # unique gene ID in 1st column
        uniq = uniq1stGene(uniqFile)  # cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq)
    result.extend([org1ref, org2ref])
    return result
def blastp2geneMap(blastFiles, organism1, taxID1, organism2, taxID2,
                   gene2refseq, topNum, IDtype='protein'):
    """
    This function transfers blastp results into geneID mapping results

    * blastFiles: a list of 2 way blastp tabular result files.
      eg: [blast1.txt,blast2.txt]. The 1st file is processed with switch
      'False', the 2nd with switch 'True'; extra files are ignored.
    * organism1: string. the 1st organism. eg: 'cho'
    * taxID1: taxonomy id of the 1st organism (passed to extract_from_gene2ref).
    * organism2: string. the 2nd organism. eg: 'human'
    * taxID2: taxonomy id of the 2nd organism (passed to extract_from_gene2ref).
    * gene2refseq: filename. eg: 'gene2refseq'
    * topNum: passed unchanged to extract_blast_ID_map
      (presumably the number of top hits kept per query -- confirm there).
    * IDtype: 'protein' selects gene2refseq columns [2,6,7,16];
      any other value selects [2,4,5,16].

    returns a list of four files. For first 2: one per blast file, as returned
    by uniq1stGene (two columns of gene ID mapping). For the rest 2:
    gene2refseq files for each organism.

    raises subprocess.CalledProcessError if an awk/sort/uniq step exits with a
    non-zero status, and OSError if the intermediate file cannot be removed.
    """
    # Local imports: only part of the module may be visible, so do not rely
    # on these names being imported at file level.
    import os
    try:
        from shlex import quote   # Python 3
    except ImportError:
        from pipes import quote   # Python 2

    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2, 6, 7, 16]
    else:
        columnNum = [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    # Keeps only the first two tab separated columns of the gene id map.
    awk_template = "awk -F $'\\t' 'BEGIN {OFS} {printrow}' {input} > {output}"
    sort_template = 'sort {keys} {input} | uniq > {output}'

    result = []
    # The direction flag is handed to the helpers as a string, not a bool.
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        # extract protein id map
        pr_id_map = extract_blast_ID_map(blast, topNum, switch)  # cho2human.top1.txt
        # protein id mapping to gene id mapping
        # NOTE(review): another caller in this file passes `switch` as the 4th
        # positional argument of mRNA_prIDMap2geneIDMap -- confirm which call
        # matches the helper's signature.
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map, org1ref, org2ref, IDtype)  # cho2human.top1.gene.txt
        # output names are derived by replacing the trailing 'txt'
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'  # cho2human.top1.gene.uniqline.txt
        interFile = gene_id_map[:-3] + 'inter.txt'

        # get unique id mappings (each line is unique, but genes are not unique)
        # quote() leaves ordinary paths untouched and protects paths that
        # contain spaces or shell metacharacters.
        cmd1 = awk_template.format(
            OFS='{FS="\\t"; OFS=FS}', printrow='{print $1,$2}',
            input=quote(gene_id_map), output=quote(interFile))
        # Only the sort key order depends on the direction.
        if switch == 'True':
            keys = '-k2,2 -k1,1'
        else:
            keys = '-k1,1 -k2,2'
        cmd2 = sort_template.format(
            keys=keys, input=quote(interFile), output=quote(uniqFile))

        # check_call: stop on a failed step instead of silently passing an
        # empty or partial file downstream. For the pipeline the shell reports
        # the status of its last stage (uniq).
        subprocess.check_call(cmd1, shell=True)
        subprocess.check_call(cmd2, shell=True)
        os.remove(interFile)

        # unique gene ID in 1st column
        uniq = uniq1stGene(uniqFile)  # cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq)
    result.extend([org1ref, org2ref])
    return result
def DB4unOverlap(unmapGeneIDs, org1ref, org2ref, blastFiles, topPrNum,
                 topGeneNum, writeNonmap=False):
    """
    This function tries to find why the 2wayblastP result doesn't have
    overlapped mapping ids. It builds one indexed gene id mapping file per
    blast direction and, on request, merges both into a table for the gene ids
    that have no overlapped mapping.

    * unmapGeneIDs: filename. gene ids that don't have overlapped mapping
      results, one id per line. Only read when writeNonmap is True.
    * org1ref: gene2refseq file of the 1st organism (passed to mRNA_prIDMap2geneIDMap).
    * org2ref: gene2refseq file of the 2nd organism (passed to mRNA_prIDMap2geneIDMap).
    * blastFiles: a list of 2 way blastp tabular result files. The 1st file is
      processed with switch 'False', the 2nd with switch 'True'.
    * topPrNum: passed unchanged to extract_blast_ID_map
      (presumably the number of top protein hits kept -- confirm there).
    * topGeneNum: rows whose 5th column is larger than this are left out of
      the merged table. Only used when writeNonmap is True.
    * writeNonmap: when True, also write the merged '...nonmap.txt' table.
      Default False keeps the previous behaviour, where the merge step was
      unreachable behind an early return.

    returns the list of final index files, one per blast file.

    raises subprocess.CalledProcessError if a shell step exits with a non-zero
    status.
    """
    # Local import: only part of the module may be visible, so do not rely on
    # this name being imported at file level.
    try:
        from shlex import quote   # Python 3
    except ImportError:
        from pipes import quote   # Python 2

    sort_template = 'sort {keys} {input} > {output}'

    indexFile = []
    # The direction flag is handed to the helpers as a string, not a bool.
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        pr_id_map = extract_blast_ID_map(blast, topPrNum, switch)  # cho2human.top5.txt
        # protein id mapping to gene id mapping
        gene_id_map = mRNA_prIDMap2geneIDMap(
            pr_id_map, org1ref, org2ref, switch, IDtype='protein')  # cho2human.top5.gene.txt

        # delete the consecutive repeated lines; repeated lines separated by
        # other lines are retained. rev + 'uniq -f 2' compares lines while
        # ignoring their last two fields.
        uniq_id_map = gene_id_map[:-3] + 'uniq.txt'  # cho2human.top5.gene.uniq.txt
        cmd = 'rev {input} | uniq -f 2 | rev > {output}'.format(
            input=quote(gene_id_map), output=quote(uniq_id_map))
        # check_call: stop on a failed step instead of indexing a bad file.
        subprocess.check_call(cmd, shell=True)

        # index the id mapping, this is for one id in org1 has multiple ids in org2 mapped to
        index_map = indexUniqline(uniq_id_map, switch)  # cho2human.top5.gene.uniq.index.txt

        # sort index file numerically on the two id columns; the leading
        # column depends on the direction
        sort_map = index_map[:-3] + 'sort.txt'  # cho2human.top5.gene.uniq.index.sort.txt
        if switch == 'False':
            keys = '-k1,1n -k2,2n'
        else:
            keys = '-k2,2n -k1,1n'
        cmd = sort_template.format(
            keys=keys, input=quote(index_map), output=quote(sort_map))
        subprocess.check_call(cmd, shell=True)

        # get uniq first two ids.
        uniqline_map = uniqFirst2Col(sort_map)  # cho2human.top5.gene.uniq.index.sort.uniqline.txt

        # sort by 1st and 5th columns for cho2human, and 2nd and 5th columns for human2cho
        sortbyindex_map = uniqline_map[:-3] + 'index.txt'  # cho2human.top5.gene.uniq.index.sort.uniqline.index.txt
        if switch == 'False':
            keys = '-k1,1n -k5,5n'
        else:
            keys = '-k2,2n -k5,5n'
        cmd = sort_template.format(
            keys=keys, input=quote(uniqline_map), output=quote(sortbyindex_map))
        subprocess.check_call(cmd, shell=True)

        final_index = indexUniqline(sortbyindex_map, switch)  # ...uniqline.index.index.txt
        indexFile.append(final_index)

    if writeNonmap:
        _write_nonmap_table(unmapGeneIDs, indexFile, topGeneNum)
    return indexFile


def _write_nonmap_table(unmapGeneIDs, indexFile, topGeneNum):
    """Merge the two final index files for the ids listed in unmapGeneIDs and write the nonmap table."""
    # One entry per unmapped gene id, four lists:
    #   [0] 5th column values from index file 1   [1] 2nd column ids from index file 1
    #   [2] 2nd column ids from index file 2      [3] 5th column values from index file 2
    geneIds = {}
    with open(unmapGeneIDs, 'r') as inputfile:
        for line in inputfile:
            # rstrip keeps the last id intact when the file has no final newline
            geneIds[line.rstrip('\n')] = [[] for _ in range(4)]

    # (index file, slot for the mapped id, slot for its index)
    # NOTE(review): both files are looked up by their 1st column, although the
    # 2nd direction is sorted on its 2nd column -- confirm the key column.
    targets = [(indexFile[0], 1, 0), (indexFile[1], 2, 3)]
    for path, id_slot, rank_slot in targets:
        with open(path, 'r') as res:
            for line in res:
                item = line.rstrip('\n').split('\t')
                if item[0] not in geneIds:
                    continue
                if int(item[4]) > topGeneNum:
                    continue
                geneIds[item[0]][id_slot].append(item[1])
                geneIds[item[0]][rank_slot].append(item[4])

    # NOTE(review): the slice length is kept from the original code; check it
    # against the real index file names (intended: cho2human.top5.nonmap.txt).
    outputfile = indexFile[0][:-22] + 'nonmap.txt'
    with open(outputfile, 'w') as output:
        for key in geneIds:
            outline = (
                '{key}\t{cho2human}\t{human2cho}\n-\t{cho2humanIndex}\t{human2choIndex}\n'
            ).format(key=key,
                     cho2human=','.join(geneIds[key][1]),
                     human2cho=','.join(geneIds[key][2]),
                     human2choIndex=','.join(geneIds[key][3]),
                     cho2humanIndex=','.join(geneIds[key][0]))
            output.write(outline)
    print('done')
    return outputfile
def DB4unOverlap(unmapGeneIDs, org1ref, org2ref, blastFiles, topPrNum,
                 topGeneNum, writeNonmap=False):
    """
    This function tries to find why the 2wayblastP result doesn't have
    overlapped mapping ids. It builds one indexed gene id mapping file per
    blast direction and, on request, merges both into a table for the gene ids
    that have no overlapped mapping.

    * unmapGeneIDs: filename. gene ids that don't have overlapped mapping
      results, one id per line. Only read when writeNonmap is True.
    * org1ref: gene2refseq file of the 1st organism (passed to mRNA_prIDMap2geneIDMap).
    * org2ref: gene2refseq file of the 2nd organism (passed to mRNA_prIDMap2geneIDMap).
    * blastFiles: a list of 2 way blastp tabular result files. The 1st file is
      processed with switch 'False', the 2nd with switch 'True'.
    * topPrNum: passed unchanged to extract_blast_ID_map
      (presumably the number of top protein hits kept -- confirm there).
    * topGeneNum: rows whose 5th column is larger than this are left out of
      the merged table. Only used when writeNonmap is True.
    * writeNonmap: when True, also write the merged '...nonmap.txt' table.
      Default False keeps the previous behaviour, where the merge step was
      unreachable behind an early return.

    returns the list of final index files, one per blast file.

    raises subprocess.CalledProcessError if a shell step exits with a non-zero
    status.
    """
    # Local import: only part of the module may be visible, so do not rely on
    # this name being imported at file level.
    try:
        from shlex import quote   # Python 3
    except ImportError:
        from pipes import quote   # Python 2

    sort_template = 'sort {keys} {input} > {output}'

    indexFile = []
    # The direction flag is handed to the helpers as a string, not a bool.
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        pr_id_map = extract_blast_ID_map(blast, topPrNum, switch)  # cho2human.top5.txt
        # protein id mapping to gene id mapping
        gene_id_map = mRNA_prIDMap2geneIDMap(
            pr_id_map, org1ref, org2ref, switch, IDtype='protein')  # cho2human.top5.gene.txt

        # delete the consecutive repeated lines; repeated lines separated by
        # other lines are retained. rev + 'uniq -f 2' compares lines while
        # ignoring their last two fields.
        uniq_id_map = gene_id_map[:-3] + 'uniq.txt'  # cho2human.top5.gene.uniq.txt
        cmd = 'rev {input} | uniq -f 2 | rev > {output}'.format(
            input=quote(gene_id_map), output=quote(uniq_id_map))
        # check_call: stop on a failed step instead of indexing a bad file.
        subprocess.check_call(cmd, shell=True)

        # index the id mapping, this is for one id in org1 has multiple ids in org2 mapped to
        index_map = indexUniqline(uniq_id_map, switch)  # cho2human.top5.gene.uniq.index.txt

        # sort index file numerically on the two id columns; the leading
        # column depends on the direction
        sort_map = index_map[:-3] + 'sort.txt'  # cho2human.top5.gene.uniq.index.sort.txt
        if switch == 'False':
            keys = '-k1,1n -k2,2n'
        else:
            keys = '-k2,2n -k1,1n'
        cmd = sort_template.format(
            keys=keys, input=quote(index_map), output=quote(sort_map))
        subprocess.check_call(cmd, shell=True)

        # get uniq first two ids.
        uniqline_map = uniqFirst2Col(sort_map)  # cho2human.top5.gene.uniq.index.sort.uniqline.txt

        # sort by 1st and 5th columns for cho2human, and 2nd and 5th columns for human2cho
        sortbyindex_map = uniqline_map[:-3] + 'index.txt'  # cho2human.top5.gene.uniq.index.sort.uniqline.index.txt
        if switch == 'False':
            keys = '-k1,1n -k5,5n'
        else:
            keys = '-k2,2n -k5,5n'
        cmd = sort_template.format(
            keys=keys, input=quote(uniqline_map), output=quote(sortbyindex_map))
        subprocess.check_call(cmd, shell=True)

        final_index = indexUniqline(sortbyindex_map, switch)  # ...uniqline.index.index.txt
        indexFile.append(final_index)

    if writeNonmap:
        _write_nonmap_table(unmapGeneIDs, indexFile, topGeneNum)
    return indexFile


def _write_nonmap_table(unmapGeneIDs, indexFile, topGeneNum):
    """Merge the two final index files for the ids listed in unmapGeneIDs and write the nonmap table."""
    # One entry per unmapped gene id, four lists:
    #   [0] 5th column values from index file 1   [1] 2nd column ids from index file 1
    #   [2] 2nd column ids from index file 2      [3] 5th column values from index file 2
    geneIds = {}
    with open(unmapGeneIDs, 'r') as inputfile:
        for line in inputfile:
            # rstrip keeps the last id intact when the file has no final newline
            geneIds[line.rstrip('\n')] = [[] for _ in range(4)]

    # (index file, slot for the mapped id, slot for its index)
    # NOTE(review): both files are looked up by their 1st column, although the
    # 2nd direction is sorted on its 2nd column -- confirm the key column.
    targets = [(indexFile[0], 1, 0), (indexFile[1], 2, 3)]
    for path, id_slot, rank_slot in targets:
        with open(path, 'r') as res:
            for line in res:
                item = line.rstrip('\n').split('\t')
                if item[0] not in geneIds:
                    continue
                if int(item[4]) > topGeneNum:
                    continue
                geneIds[item[0]][id_slot].append(item[1])
                geneIds[item[0]][rank_slot].append(item[4])

    # NOTE(review): the slice length is kept from the original code; check it
    # against the real index file names (intended: cho2human.top5.nonmap.txt).
    outputfile = indexFile[0][:-22] + 'nonmap.txt'
    with open(outputfile, 'w') as output:
        for key in geneIds:
            outline = (
                '{key}\t{cho2human}\t{human2cho}\n-\t{cho2humanIndex}\t{human2choIndex}\n'
            ).format(key=key,
                     cho2human=','.join(geneIds[key][1]),
                     human2cho=','.join(geneIds[key][2]),
                     human2choIndex=','.join(geneIds[key][3]),
                     cho2humanIndex=','.join(geneIds[key][0]))
            output.write(outline)
    print('done')
    return outputfile