def _read_name_index(path):
    """Index a two-column, tab-separated file by its second column.

    Returns a tuple (index, names):
    * index: dict mapping each second-column value to the list of
      first-column values found on the same lines, in file order.
    * names: the distinct second-column values in order of first appearance.
    """
    index = {}
    names = []
    with open(path, 'r') as handle:
        for line in handle:
            # Strip only the line terminator; slicing off the last character
            # would corrupt a final line that has no trailing newline.
            item = line.rstrip('\n').split('\t')
            if item[1] in index:
                # the same name is attached to several ids
                index[item[1]].append(item[0])
            else:
                index[item[1]] = [item[0]]
                names.append(item[1])
    return index, names


def name2geneMap(organism1, taxID1, organism2, taxID2, gene_info):
    """
    This function maps gene ids between two species by full name.

    * organism1: string. the 1st organism. eg: 'cho'
    * taxID1: taxonomy ID. eg: 10029
    * organism2: string. the 2nd organism. eg: 'human'
    * taxID2: taxonomy ID. eg: 9606
    * gene_info: passed unchanged to extract_from_gene2ref (presumably the
      NCBI gene_info file name -- confirm against that helper).

    return: [mapping file, organism1 extract file].
    The mapping file name is the organism1 extract file name with its last
    three characters replaced by '<organism1>2<organism2>.txt'. Each line is
    '<organism1 id>\\t<organism2 ids joined by ";">' for every name present in
    both extracts; an organism1 id appears once per shared name it carries.
    Lines follow the order in which names first appear in the organism1
    extract.
    """
    # columnNum=[2, 9] is forwarded as-is; the extract files are read below
    # as "<id>\t<name>" lines.
    org1_file = extract_from_gene2ref(gene_info, taxID1, organism1,
                                      columnNum=[2, 9])
    org2_file = extract_from_gene2ref(gene_info, taxID2, organism2,
                                      columnNum=[2, 9])
    outputFile = org1_file[:-3] + organism1 + '2' + organism2 + '.txt'

    # build the name -> ids libraries for both organisms
    org1_lib, org1_names = _read_name_index(org1_file)
    org2_lib, _ = _read_name_index(org2_file)

    # from name to gene id, and output; walking org1_names (instead of an
    # unordered set intersection) keeps the output order reproducible.
    with open(outputFile, 'w') as output:
        for name in org1_names:
            if name not in org2_lib:
                continue
            # both sides are lists and can hold many ids
            joined_ids = ';'.join(org2_lib[name])
            for fir in org1_lib[name]:
                output.write(fir + '\t' + joined_ids + '\n')
    return [outputFile, org1_file]
def _read_name_index(path):
    """Index a two-column, tab-separated file by its second column.

    Returns a tuple (index, names):
    * index: dict mapping each second-column value to the list of
      first-column values found on the same lines, in file order.
    * names: the distinct second-column values in order of first appearance.
    """
    index = {}
    names = []
    with open(path, 'r') as handle:
        for line in handle:
            # Strip only the line terminator; slicing off the last character
            # would corrupt a final line that has no trailing newline.
            item = line.rstrip('\n').split('\t')
            if item[1] in index:
                # the same name is attached to several ids
                index[item[1]].append(item[0])
            else:
                index[item[1]] = [item[0]]
                names.append(item[1])
    return index, names


def name2geneMap(organism1, taxID1, organism2, taxID2, gene_info):
    """
    This function maps gene ids between two species by full name.

    * organism1: string. the 1st organism. eg: 'cho'
    * taxID1: taxonomy ID. eg: 10029
    * organism2: string. the 2nd organism. eg: 'human'
    * taxID2: taxonomy ID. eg: 9606
    * gene_info: passed unchanged to extract_from_gene2ref (presumably the
      NCBI gene_info file name -- confirm against that helper).

    return: [mapping file, organism1 extract file].
    The mapping file name is the organism1 extract file name with its last
    three characters replaced by '<organism1>2<organism2>.txt'. Each line is
    '<organism1 id>\\t<organism2 ids joined by ";">' for every name present in
    both extracts; an organism1 id appears once per shared name it carries.
    Lines follow the order in which names first appear in the organism1
    extract.
    """
    # columnNum=[2, 9] is forwarded as-is; the extract files are read below
    # as "<id>\t<name>" lines.
    org1_file = extract_from_gene2ref(gene_info, taxID1, organism1,
                                      columnNum=[2, 9])
    org2_file = extract_from_gene2ref(gene_info, taxID2, organism2,
                                      columnNum=[2, 9])
    outputFile = org1_file[:-3] + organism1 + '2' + organism2 + '.txt'

    # build the name -> ids libraries for both organisms
    org1_lib, org1_names = _read_name_index(org1_file)
    org2_lib, _ = _read_name_index(org2_file)

    # from name to gene id, and output; walking org1_names (instead of an
    # unordered set intersection) keeps the output order reproducible.
    with open(outputFile, 'w') as output:
        for name in org1_names:
            if name not in org2_lib:
                continue
            # both sides are lists and can hold many ids
            joined_ids = ';'.join(org2_lib[name])
            for fir in org1_lib[name]:
                output.write(fir + '\t' + joined_ids + '\n')
    return [outputFile, org1_file]
def _write_unique_id_pairs(gene_id_map, interFile, uniqFile, sort_keys):
    """Write the distinct, sorted (column 1, column 2) pairs of a mapping file.

    * gene_id_map: tab-separated input file.
    * interFile: scratch file holding the two extracted columns; it is always
      removed before returning, even on failure.
    * uniqFile: destination file.
    * sort_keys: list of `sort` key options, eg: ['-k1,1', '-k2,2'].

    Raises subprocess.CalledProcessError if awk, sort or uniq exits non-zero.
    """
    import os  # local import: this helper must not rely on file-level imports

    try:
        # Commands are passed as argument lists (no shell), so file names with
        # spaces or shell metacharacters are handled correctly. The BEGIN
        # block sets the field separator, so no -F option is needed.
        with open(interFile, 'w') as inter_out:
            subprocess.check_call(
                ['awk', 'BEGIN {FS="\\t"; OFS=FS} {print $1,$2}', gene_id_map],
                stdout=inter_out)

        # equivalent of: sort <keys> interFile | uniq > uniqFile
        sort_cmd = ['sort'] + list(sort_keys) + [interFile]
        sorter = subprocess.Popen(sort_cmd, stdout=subprocess.PIPE)
        try:
            with open(uniqFile, 'w') as uniq_out:
                subprocess.check_call(['uniq'], stdin=sorter.stdout,
                                      stdout=uniq_out)
        finally:
            sorter.stdout.close()
            sort_status = sorter.wait()
        if sort_status:
            raise subprocess.CalledProcessError(sort_status, sort_cmd)
    finally:
        if os.path.exists(interFile):
            os.remove(interFile)


def blastp2geneMap(blastFiles, organism1, taxID1, organism2, taxID2,
                   gene2refseq, topNum, IDtype='protein'):
    """
    This function transfers blastp results into geneID mapping results

    * blastFiles: a list of 2 way blastp tabular result files.
      eg: [blast1.txt,blast2.txt]. Only the first two entries are processed.
    * organism1: string. the 1st organism. eg: 'cho'
    * taxID1: taxonomy ID of the 1st organism.
    * organism2: string. the 2nd organism. eg: 'human'
    * taxID2: taxonomy ID of the 2nd organism.
    * gene2refseq: filename. eg: 'gene2refseq'
    * topNum: forwarded to extract_blast_ID_map (presumably the number of top
      hits to keep -- confirm against that helper).
    * IDtype: 'protein' selects columns [2, 6, 7, 16] of gene2refseq, any
      other value selects [2, 4, 5, 16].

    returns a list: one uniq1stGene output file per processed blast file,
    followed by the two gene2refseq extract files, one for each organism.

    Raises subprocess.CalledProcessError if an external awk/sort/uniq call
    fails.
    """
    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2, 6, 7, 16]
    else:
        columnNum = [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    result = []
    # The switch is deliberately a string: it is handed to
    # extract_blast_ID_map as-is and compared against 'True' below.
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        # extract protein id map, eg: cho2human.top1.txt
        pr_id_map = extract_blast_ID_map(blast, topNum, switch)
        # protein id mapping to gene id mapping, eg: cho2human.top1.gene.txt
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map, org1ref, org2ref,
                                             IDtype)
        # eg: cho2human.top1.gene.uniqline.txt
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'
        interFile = gene_id_map[:-3] + 'inter.txt'
        # get unique id mappings (each line is unique, but genes are not
        # unique); the second file is ordered by its 2nd column first.
        if switch == 'True':
            sort_keys = ['-k2,2', '-k1,1']
        else:
            sort_keys = ['-k1,1', '-k2,2']
        _write_unique_id_pairs(gene_id_map, interFile, uniqFile, sort_keys)
        # unique gene ID in 1st column,
        # eg: cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq1stGene(uniqFile))
    result.extend([org1ref, org2ref])
    return result
def _write_unique_id_pairs(gene_id_map, interFile, uniqFile, sort_keys):
    """Write the distinct, sorted (column 1, column 2) pairs of a mapping file.

    * gene_id_map: tab-separated input file.
    * interFile: scratch file holding the two extracted columns; it is always
      removed before returning, even on failure.
    * uniqFile: destination file.
    * sort_keys: list of `sort` key options, eg: ['-k1,1', '-k2,2'].

    Raises subprocess.CalledProcessError if awk, sort or uniq exits non-zero.
    """
    import os  # local import: this helper must not rely on file-level imports

    try:
        # Commands are passed as argument lists (no shell), so file names with
        # spaces or shell metacharacters are handled correctly. The BEGIN
        # block sets the field separator, so no -F option is needed.
        with open(interFile, 'w') as inter_out:
            subprocess.check_call(
                ['awk', 'BEGIN {FS="\\t"; OFS=FS} {print $1,$2}', gene_id_map],
                stdout=inter_out)

        # equivalent of: sort <keys> interFile | uniq > uniqFile
        sort_cmd = ['sort'] + list(sort_keys) + [interFile]
        sorter = subprocess.Popen(sort_cmd, stdout=subprocess.PIPE)
        try:
            with open(uniqFile, 'w') as uniq_out:
                subprocess.check_call(['uniq'], stdin=sorter.stdout,
                                      stdout=uniq_out)
        finally:
            sorter.stdout.close()
            sort_status = sorter.wait()
        if sort_status:
            raise subprocess.CalledProcessError(sort_status, sort_cmd)
    finally:
        if os.path.exists(interFile):
            os.remove(interFile)


def blastp2geneMap(blastFiles, organism1, taxID1, organism2, taxID2,
                   gene2refseq, topNum, IDtype='protein'):
    """
    This function transfers blastp results into geneID mapping results

    * blastFiles: a list of 2 way blastp tabular result files.
      eg: [blast1.txt,blast2.txt]. Only the first two entries are processed.
    * organism1: string. the 1st organism. eg: 'cho'
    * taxID1: taxonomy ID of the 1st organism.
    * organism2: string. the 2nd organism. eg: 'human'
    * taxID2: taxonomy ID of the 2nd organism.
    * gene2refseq: filename. eg: 'gene2refseq'
    * topNum: forwarded to extract_blast_ID_map (presumably the number of top
      hits to keep -- confirm against that helper).
    * IDtype: 'protein' selects columns [2, 6, 7, 16] of gene2refseq, any
      other value selects [2, 4, 5, 16].

    returns a list: one uniq1stGene output file per processed blast file,
    followed by the two gene2refseq extract files, one for each organism.

    Raises subprocess.CalledProcessError if an external awk/sort/uniq call
    fails.
    """
    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2, 6, 7, 16]
    else:
        columnNum = [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    result = []
    # The switch is deliberately a string: it is handed to
    # extract_blast_ID_map as-is and compared against 'True' below.
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        # extract protein id map, eg: cho2human.top1.txt
        pr_id_map = extract_blast_ID_map(blast, topNum, switch)
        # protein id mapping to gene id mapping, eg: cho2human.top1.gene.txt
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map, org1ref, org2ref,
                                             IDtype)
        # eg: cho2human.top1.gene.uniqline.txt
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'
        interFile = gene_id_map[:-3] + 'inter.txt'
        # get unique id mappings (each line is unique, but genes are not
        # unique); the second file is ordered by its 2nd column first.
        if switch == 'True':
            sort_keys = ['-k2,2', '-k1,1']
        else:
            sort_keys = ['-k1,1', '-k2,2']
        _write_unique_id_pairs(gene_id_map, interFile, uniqFile, sort_keys)
        # unique gene ID in 1st column,
        # eg: cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq1stGene(uniqFile))
    result.extend([org1ref, org2ref])
    return result
def _merge_duplicate_pairs(sorted_path, merged_path):
    """Collapse consecutive rows that share their first two columns.

    For each run of such rows, the first row is kept and its columns 9 and 10
    (indices 8 and 9) are replaced by the minimum and maximum of those two
    columns over the whole run. The result is written to merged_path.
    """
    with open(sorted_path, 'r') as rows, open(merged_path, 'w') as merged:
        id_pair = rows.readline().rstrip('\n').split('\t')
        # start from second line
        for line in rows:
            item = line.rstrip('\n').split('\t')
            if id_pair[0] == item[0] and id_pair[1] == item[1]:
                # Compute both bounds from the untouched values before
                # storing either one; updating index 8 first would drop its
                # old value from the maximum.
                bounds = [int(id_pair[8]), int(id_pair[9]),
                          int(item[8]), int(item[9])]
                id_pair[8] = str(min(bounds))
                id_pair[9] = str(max(bounds))
            else:
                merged.write('\t'.join(id_pair) + '\n')
                id_pair = item
        merged.write('\t'.join(id_pair) + '\n')


def _write_rosetta_group(group, output):
    """Run rosetta_pair on a group of two or more rows and write its pairs."""
    if len(group) > 1:
        for pair in rosetta_pair(group):
            output.write('\t'.join(pair) + '\n')


def rosetta_stone(blastFile, organism1, taxID1, organism2, taxID2,
                  gene2refseq):
    """
    This functions detects rosetta stone pairs in the blast result.

    * blastFile: filename of tabular blast output.
    * organism1, taxID1: name and taxonomy ID of the 1st organism.
    * organism2, taxID2: name and taxonomy ID of the 2nd organism.
    * gene2refseq: filename. eg: 'gene2refseq'

    return: name of the output file, built from the sorted file name with its
    last 16 characters replaced by 'rosetta.txt'. Each line is one pair
    returned by rosetta_pair, tab separated.

    Side effects: the file returned by blastName2ID is deleted, and the
    sorted file and a scratch file 'inter.txt' (in the current working
    directory) are left on disk.

    Raises subprocess.CalledProcessError if the external sort fails.
    """
    # (1) change first 2 columns into ID, extract top hit
    IDblast = blastName2ID(blastFile)
    # (2) extract both organisms from gene2refseq
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1,
                                    columnNum=[2, 6, 7])
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2,
                                    columnNum=[2, 6, 7])
    # (3) replace protein ID with gene ID
    IDmap = prIDMap2geneIDMap(IDblast, org1ref, org2ref)
    # (4) sort file by 2nd column. The command is an argument list (no
    # shell), so unusual characters in the file name cannot break it.
    sortFile = IDmap[:-3] + 'sort.txt'
    try:
        with open(sortFile, 'w') as sorted_out:
            subprocess.check_call(['sort', '-k2,2', '-n', IDmap],
                                  stdout=sorted_out)
    finally:
        # the ID-converted blast file is no longer needed either way
        os.remove(IDblast)

    # (5) now come to the parsing rosetta stone pairs
    inter = 'inter.txt'
    outputFile = sortFile[:-16] + 'rosetta.txt'

    # combine lines with same first 2 columns
    _merge_duplicate_pairs(sortFile, inter)

    # inter now has unique first 2 columns of blast tabular results
    with open(inter, 'r') as merged, open(outputFile, 'w') as output:
        # group stores lines with same reference (2nd column)
        group = [merged.readline().rstrip('\n').split('\t')]
        for line in merged:
            item = line.rstrip('\n').split('\t')
            if item[1] == group[-1][1]:
                group.append(item)
            else:
                _write_rosetta_group(group, output)
                group = [item]
        # flush the last group
        _write_rosetta_group(group, output)
    return outputFile