Python extract_from_gene2ref Examples, Modules.f05_IDConvert.extract_from_gene2ref Python Examples

Example #1

0

Show file

File: IdMappingModule.py Project: shl198/Projects

def name2geneMap(organism1, taxID1, organism2, taxID2, gene_info):
    """
    This function maps gene ids between two species by full name.

    * organism1: string. the 1st organism. eg: 'cho'

    * taxID1: taxonomy ID. eg: 10029

    * organism2: string. the 2nd organism. eg: 'human'

    * taxID2: taxonomy ID. eg: 9606

    * gene_info: filename handed to extract_from_gene2ref, which is asked
      for columns 2 and 9 of it for each organism.

    return: [cho2human.txt, cho_gene_info.txt]. For 'cho2human.txt' file,
    ID mapping is based on gene names: every line holds one gene ID of
    organism1, a tab, and all gene IDs of organism2 that carry the same
    name, joined by ';'. Names are written in the order they first appear
    in the organism1 file.
    """

    def read_name_index(path):
        # Build {gene name: [gene IDs]} from a two-column, tab-separated
        # file whose 1st column is the ID and 2nd column is the name.
        index = {}
        with open(path, 'r') as handle:
            for line in handle:
                # rstrip instead of line[:-1]: the last line may lack a
                # trailing newline, and slicing would eat a real character.
                line = line.rstrip('\n')
                if not line:
                    continue  # tolerate blank lines
                item = line.split('\t')
                # a gene name can have many ids
                index.setdefault(item[1], []).append(item[0])
        return index

    org1_file = extract_from_gene2ref(gene_info,
                                      taxID1,
                                      organism1,
                                      columnNum=[2, 9])
    org2_file = extract_from_gene2ref(gene_info,
                                      taxID2,
                                      organism2,
                                      columnNum=[2, 9])
    outputFile = org1_file[:-3] + organism1 + '2' + organism2 + '.txt'

    org1_lib = read_name_index(org1_file)
    org2_lib = read_name_index(org2_file)

    # from name to gene id, and output
    with open(outputFile, 'w') as output:
        # Walk organism1's names in file order rather than iterating a set,
        # so the output does not change from run to run.
        for name, geneID1 in org1_lib.items():
            if name not in org2_lib:
                continue
            # geneID1 and geneID2 are lists and can hold many items.
            joined_ids = ';'.join(org2_lib[name])
            for fir in geneID1:
                output.write(fir + '\t' + joined_ids + '\n')
    return [outputFile, org1_file]

Example #2

0

Show file

File: IdMappingModule.py Project: shl198/Pipeline

def name2geneMap(organism1,taxID1,organism2,taxID2,gene_info):
    """
    This function maps gene ids between two species by full name.
    
    * organism1: string. the 1st organism. eg: 'cho'
    
    * taxID1: taxonomy ID. eg: 10029
    
    * organism2: string. the 2nd organism. eg: 'human'
    
    * taxID2: taxonomy ID. eg: 9606
    
    * gene_info: filename handed to extract_from_gene2ref, which is asked
      for columns 2 and 9 of it for each organism.
    
    return: [cho2human.txt, cho_gene_info.txt]. For 'cho2human.txt' file,
    ID mapping is based on gene names: every line holds one gene ID of
    organism1, a tab, and all gene IDs of organism2 that carry the same
    name, joined by ';'. Names are written in the order they first appear
    in the organism1 file.
    """
    org1_file = extract_from_gene2ref(gene_info,taxID1,organism1,columnNum=[2,9])
    org2_file = extract_from_gene2ref(gene_info,taxID2,organism2,columnNum=[2,9])
    outputFile = org1_file[:-3] + organism1 + '2' + organism2 + '.txt'
    # build the name -> [gene ids] library for each organism
    org1_lib = {}; org2_lib = {}
    for ref_file,lib in ((org1_file,org1_lib),(org2_file,org2_lib)):
        # 'with' closes the handle even if a malformed line raises
        with open(ref_file,'r') as result:
            for line in result:
                # rstrip instead of line[:-1]: the last line may have no
                # newline, in which case slicing would drop a real character
                line = line.rstrip('\n')
                if not line:
                    continue  # tolerate blank lines
                item = line.split('\t')
                # a gene name can have many ids
                lib.setdefault(item[1],[]).append(item[0])
    # from name to gene id, and output
    with open(outputFile,'w') as output:
        # iterate organism1 names in file order (not a set) so the output
        # order is the same on every run
        for name in org1_lib:
            if name not in org2_lib:
                continue
            # geneID1 and geneID2 are lists and can hold many items.
            geneID1 = org1_lib[name]
            geneID2 = ';'.join(org2_lib[name])
            for fir in geneID1:
                output.write(fir + '\t' + geneID2 + '\n')
    return [outputFile,org1_file]

Example #3

0

Show file

File: IdMappingModule.py Project: shl198/Pipeline

def blastp2geneMap(blastFiles,organism1,taxID1,organism2,taxID2,gene2refseq,topNum,IDtype='protein'):
    """
    This function transfers blastp results into geneID mapping results
    
    * blastFiles: a list of 2  way blastp tabular result files. eg: [blast1.txt,blast2.txt]
      Only the first two entries are used; the 1st is processed with switch
      'False' and the 2nd with switch 'True'.
    
    * organism1: string. the 1st organism. eg: 'cho'
    
    * taxID1: taxonomy ID of the 1st organism, passed to extract_from_gene2ref.
    
    * organism2: string. the 2nd organism. eg: 'human'
    
    * taxID2: taxonomy ID of the 2nd organism, passed to extract_from_gene2ref.
    
    * gene2refseq: filename. eg: 'gene2refseq'
    
    * topNum: passed unchanged to extract_blast_ID_map.
    
    * IDtype: 'protein' extracts columns [2,6,7,16] from gene2refseq, any
      other value extracts columns [2,4,5,16]. Also passed to
      mRNA_prIDMap2geneIDMap.
    
    returns a list of four files. For first 2: each with two columns of gene ID mapping.
    For the rest 2: gene2refseq files for each organism
    
    raises subprocess.CalledProcessError if awk, sort or uniq exits non-zero.
    """
    import os
    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2,6,7,16]
    else:
        columnNum = [2,4,5,16]
    org1ref = extract_from_gene2ref(gene2refseq,taxID1,organism1,columnNum)
    org2ref = extract_from_gene2ref(gene2refseq,taxID2,organism2,columnNum)
    # Keep the first two tab-separated columns. FS is set inside BEGIN, so
    # the bash-only -F $'\t' flag of the old shell string is not needed.
    awk_prog = 'BEGIN {FS="\\t"; OFS=FS} {print $1,$2}'
    result = []
    switches = ['False','True']
    for blast,switch in zip(blastFiles,switches):
        # extract protein id map
        pr_id_map = extract_blast_ID_map(blast,topNum,switch)  # pr_id_map: cho2human.top1.txt
        # protein id mapping to gene id mapping
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map,org1ref,org2ref,IDtype) # cho2human.top1.gene.txt
        # get unique line of mapping
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'   # cho2human.top1.gene.uniqline.txt
        interFile = gene_id_map[:-3] + 'inter.txt'
        # the two directions differ only in which column is the primary sort key
        if switch == 'True':
            sort_keys = ['-k2,2','-k1,1']
        else:
            sort_keys = ['-k1,1','-k2,2']
        # # # get unique id mappings (each line is unique, but genes are not unique)
        # Commands run as argument lists (no shell), so file names containing
        # spaces or shell metacharacters are passed through untouched.
        try:
            with open(interFile,'w') as inter_out:
                subprocess.check_call(['awk',awk_prog,gene_id_map],stdout=inter_out)
            with open(uniqFile,'w') as uniq_out:
                # equivalent of: sort <keys> interFile | uniq > uniqFile
                sorter = subprocess.Popen(['sort'] + sort_keys + [interFile],stdout=subprocess.PIPE)
                try:
                    subprocess.check_call(['uniq'],stdin=sorter.stdout,stdout=uniq_out)
                finally:
                    sorter.stdout.close()
                    sort_status = sorter.wait()
                if sort_status != 0:
                    raise subprocess.CalledProcessError(sort_status,'sort')
        finally:
            # the intermediate file is scratch space only
            if os.path.exists(interFile):
                os.remove(interFile)
        # unique gene ID in 1st column
        uniq = uniq1stGene(uniqFile) # cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq)
    result.extend([org1ref,org2ref])
    return result

Example #4

0

Show file

File: IdMappingModule.py Project: shl198/Projects

def blastp2geneMap(blastFiles,
                   organism1,
                   taxID1,
                   organism2,
                   taxID2,
                   gene2refseq,
                   topNum,
                   IDtype='protein'):
    """
    This function transfers blastp results into geneID mapping results

    * blastFiles: a list of 2  way blastp tabular result files. eg: [blast1.txt,blast2.txt]
      Only the first two entries are used; the 1st is processed with switch
      'False' and the 2nd with switch 'True'.

    * organism1: string. the 1st organism. eg: 'cho'

    * taxID1: taxonomy ID of the 1st organism, passed to extract_from_gene2ref.

    * organism2: string. the 2nd organism. eg: 'human'

    * taxID2: taxonomy ID of the 2nd organism, passed to extract_from_gene2ref.

    * gene2refseq: filename. eg: 'gene2refseq'

    * topNum: passed unchanged to extract_blast_ID_map.

    * IDtype: 'protein' extracts columns [2, 6, 7, 16] from gene2refseq, any
      other value extracts columns [2, 4, 5, 16]. Also passed to
      mRNA_prIDMap2geneIDMap.

    returns a list of four files. For first 2: each with two columns of gene ID mapping.
    For the rest 2: gene2refseq files for each organism

    raises subprocess.CalledProcessError if awk, sort or uniq exits non-zero.
    """
    import os

    # extract gene, accession, protein mapping
    if IDtype == 'protein':
        columnNum = [2, 6, 7, 16]
    else:
        columnNum = [2, 4, 5, 16]
    org1ref = extract_from_gene2ref(gene2refseq, taxID1, organism1, columnNum)
    org2ref = extract_from_gene2ref(gene2refseq, taxID2, organism2, columnNum)

    # Keep the first two tab-separated columns. FS is set inside BEGIN, so
    # the bash-only -F $'\t' flag of the old shell string is not needed.
    awk_prog = 'BEGIN {FS="\\t"; OFS=FS} {print $1,$2}'

    result = []
    switches = ['False', 'True']
    for blast, switch in zip(blastFiles, switches):
        # extract protein id map
        pr_id_map = extract_blast_ID_map(
            blast, topNum, switch)  # pr_id_map: cho2human.top1.txt
        # protein id mapping to gene id mapping
        gene_id_map = mRNA_prIDMap2geneIDMap(pr_id_map, org1ref, org2ref,
                                             IDtype)  # cho2human.top1.gene.txt
        # get unique line of mapping
        uniqFile = gene_id_map[:-3] + 'uniqline.txt'  # cho2human.top1.gene.uniqline.txt
        interFile = gene_id_map[:-3] + 'inter.txt'

        # the two directions differ only in which column is the primary sort key
        if switch == 'True':
            sort_keys = ['-k2,2', '-k1,1']
        else:
            sort_keys = ['-k1,1', '-k2,2']

        # # # get unique id mappings (each line is unique, but genes are not unique)
        # Commands run as argument lists (no shell), so file names containing
        # spaces or shell metacharacters are passed through untouched.
        try:
            with open(interFile, 'w') as inter_out:
                subprocess.check_call(['awk', awk_prog, gene_id_map],
                                      stdout=inter_out)
            with open(uniqFile, 'w') as uniq_out:
                # equivalent of: sort <keys> interFile | uniq > uniqFile
                sorter = subprocess.Popen(['sort'] + sort_keys + [interFile],
                                          stdout=subprocess.PIPE)
                try:
                    subprocess.check_call(['uniq'],
                                          stdin=sorter.stdout,
                                          stdout=uniq_out)
                finally:
                    sorter.stdout.close()
                    sort_status = sorter.wait()
                if sort_status != 0:
                    raise subprocess.CalledProcessError(sort_status, 'sort')
        finally:
            # the intermediate file is scratch space only
            if os.path.exists(interFile):
                os.remove(interFile)

        # unique gene ID in 1st column
        uniq = uniq1stGene(
            uniqFile)  # cho2human.top1.gene.uniqline.uniq1stgene.txt
        result.append(uniq)
    result.extend([org1ref, org2ref])
    return result

Example #5

0

Show file

def _merge_adjacent_hits(rows):
    """
    Yield blast rows, collapsing consecutive rows whose first 2 columns match.
    
    * rows: iterable of lists of strings (one list per tab-split line).
    
    The first row of each run is kept; its columns 9 and 10 (index 8, 9)
    become the min and max over those two columns of every row in the run.
    """
    current = None
    for row in rows:
        if current is not None and current[0] == row[0] and current[1] == row[1]:
            # Take min and max from the same snapshot of values; assigning
            # the min first and then recomputing would drop current[8]
            # from the max.
            coords = [int(current[8]),int(current[9]),int(row[8]),int(row[9])]
            current[8] = str(min(coords))
            current[9] = str(max(coords))
        else:
            if current is not None:
                yield current
            current = row
    if current is not None:
        yield current


def rosetta_stone(blastFile,organism1,taxID1,organism2,taxID2,gene2refseq):
    """
    This functions detects rosetta stone pairs in the blast result.
    
    * blastFile: filename of tabular blast output.
    
    * organism1, taxID1: name and taxonomy ID of the 1st organism, passed to
      extract_from_gene2ref.
    
    * organism2, taxID2: name and taxonomy ID of the 2nd organism, passed to
      extract_from_gene2ref.
    
    * gene2refseq: filename passed to extract_from_gene2ref (columns [2,6,7]).
    
    returns the name of the output file: the sorted file name with its last
    16 characters replaced by 'rosetta.txt'. Each line is one item returned
    by rosetta_pair, tab-joined.
    
    The file produced by blastName2ID is deleted; the sorted file is kept.
    Merged rows are streamed in memory, so no 'inter.txt' scratch file is
    written to the working directory.
    
    raises subprocess.CalledProcessError if sort exits non-zero.
    """
    import itertools
    # # (1) change first 2 columns into ID, extract top hit
    IDblast = blastName2ID(blastFile)
    # # (2) extract cho, human from gene2refseq
    org1ref = extract_from_gene2ref(gene2refseq,taxID1,organism1,columnNum=[2,6,7])
    org2ref = extract_from_gene2ref(gene2refseq,taxID2,organism2,columnNum=[2,6,7])
    # # (3) replace protein ID with gene ID
    IDmap = prIDMap2geneIDMap(IDblast,org1ref,org2ref)
    # # (4) sort file by 2nd column
    sortFile = IDmap[:-3] + 'sort.txt'
    # argument list instead of a shell string: file names are passed verbatim
    with open(sortFile,'w') as sorted_out:
        subprocess.check_call(['sort','-k2,2','-n',IDmap],stdout=sorted_out)
    os.remove(IDblast)
    
    # # (5) now come to the parsing rosetta stone pairs
    outputFile = sortFile[:-16] + 'rosetta.txt'
    with open(sortFile,'r') as res, open(outputFile,'w') as output:
        # rstrip instead of line[:-1] so a final line without a newline is
        # not truncated; blank lines are skipped
        rows = (line.rstrip('\n').split('\t') for line in res if line.rstrip('\n'))
        # combine lines with same first 2 columns
        merged = _merge_adjacent_hits(rows)
        # consecutive rows sharing the 2nd column form one group (same reference)
        for _,hits in itertools.groupby(merged,key=lambda hit: hit[1]):
            group = list(hits)
            # a single hit cannot form a pair, so skip detection
            if len(group) > 1:
                for pair in rosetta_pair(group):
                    output.write('\t'.join(pair) + '\n')
    return outputFile