Ejemplo n.º 1
0
def main ():
    """
    Command-line entry point: populate alignments for every protein in the
    protein list.

    The single required argument selects what is populated: one of
    ``blastn``, ``tblastn``, ``SW_gene``, ``SW_exon`` or ``all`` (which runs
    the four steps in that order). The referenced-species databases are
    always populated first.

    Exits with status 1 (after printing a usage line) when the mode argument
    is missing or not recognised. The check happens before any container is
    filled, so a bad invocation fails immediately instead of after the
    expensive setup work.
    """
    valid_modes = ("blastn", "tblastn", "SW_gene", "SW_exon", "all")

    # sys.argv[0] is the script name, so a mode is only present when there
    # are at least two entries.
    if len(sys.argv) < 2 or sys.argv[1] not in valid_modes:
        print("Usage: {0} <blastn | tblastn | SW_gene | SW_exon | all> \n".format(sys.argv[0]))
        sys.exit(1)
    mode = sys.argv[1]

    referenced_species = "Homo_sapiens"
    # Each entry of the raw list is a tuple whose first element is the
    # protein id; only the id is needed here.
    protein_list = [protein_tuple[0] for protein_tuple in FileUtilities.get_protein_list()]

    fill_all_containers(False)

    populate_referenced_species_databases(protein_list, referenced_species)

    run_everything = (mode == "all")
    if run_everything or mode == "blastn":
        populate_blastn_alignments(protein_list)
    if run_everything or mode == "tblastn":
        populate_tblastn_alignments(protein_list)
    if run_everything or mode == "SW_gene":
        populate_SW_gene_alignments(protein_list)
    if run_everything or mode == "SW_exon":
        populate_SW_exon_alignments(protein_list)
Ejemplo n.º 2
0
def create_msa_alignments ():
    """
    Write two multi-record FASTA inputs per protein and run the generated
    alignment command on each of them.

    For every (protein id, exon count) pair returned by get_protein_list()
    for which check_status_file() is truthy, two files are written into the
    directory returned by DirectoryCrawler.get_mafft_path():

      * msa_exoloc.fa  - the reference record followed by one record per
        file in the assembled-protein directory (sorted by file name,
        skipping Homo_sapiens.fa);
      * msa_ensembl.fa - the reference record followed by the
        ProteinContainer record of every species other than Homo_sapiens,
        with the record id overwritten by the species name.

    The command produced by AlignmentCommandGenerator.generate_mafft_command
    is printed and executed through os.system for each input; the outputs
    are msa_exoloc.afa and msa_ensembl.afa in the same directory.
    """
    crawler = DirectoryCrawler()
    protein_container = ProteinContainer.Instance()
    data_map_container = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        reference_record = protein_container.get(prot_id).get_sequence_record()

        # Records read back from the per-species FASTA files on disk.
        assembled_dir = crawler.get_assembled_protein_path(prot_id)
        exoloc_records = [reference_record] + [
            load_fasta_single_record("%s/%s" % (assembled_dir, fasta_name), IUPAC.protein)
            for fasta_name in sorted(os.listdir(assembled_dir))
            if fasta_name != "Homo_sapiens.fa"
        ]

        # Records taken from the protein container, relabelled by species.
        ensembl_records = [reference_record]
        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                species_map = data_map_container.get((prot_id, species))
                species_record = protein_container.get(species_map.protein_id).get_sequence_record()
                species_record.id = species
                ensembl_records.append(species_record)

        msa_exoloc_path = "%s/msa_exoloc.fa" % crawler.get_mafft_path(prot_id)
        msa_ensembl_path = "%s/msa_ensembl.fa" % crawler.get_mafft_path(prot_id)

        write_seq_records_to_file(msa_exoloc_path, exoloc_records)
        write_seq_records_to_file(msa_ensembl_path, ensembl_records)

        # Same generate / print / execute sequence for both inputs.
        alignment_jobs = (
            (msa_exoloc_path, "%s/msa_exoloc.afa"),
            (msa_ensembl_path, "%s/msa_ensembl.afa"),
        )
        for input_path, output_template in alignment_jobs:
            cmd = command_generator.generate_mafft_command(
                input_path, output_template % crawler.get_mafft_path(prot_id))
            print(cmd)
            os.system(cmd)
Ejemplo n.º 3
0
def create_species_msa_alignments ():
    """
    Run one alignment per (protein, species) pair.

    For every (protein id, exon count) pair returned by get_protein_list()
    for which check_status_file() is truthy, and for every species other
    than Homo_sapiens, a temporary FASTA file <species>.fa is written into
    the directory returned by DirectoryCrawler.get_mafft_path(). It holds:

      1. the reference record, with its id set to "Homo_sapiens";
      2. the species' record from the ProteinContainer, with its id set to
         the species name;
      3. the record loaded from <species>.fa in the assembled-protein
         directory, when that file exists.

    The command from AlignmentCommandGenerator.generate_mafft_command is
    printed and executed through os.system, producing <species>.afa, and
    the temporary input file is then deleted.
    """
    crawler = DirectoryCrawler()
    protein_container = ProteinContainer.Instance()
    data_map_container = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        human_record = protein_container.get(prot_id).get_sequence_record()
        human_record.id = "Homo_sapiens"

        assembled_dir = crawler.get_assembled_protein_path(prot_id)

        for species in get_species_list(prot_id, None):

            # The reference species is not aligned against itself.
            if species == "Homo_sapiens":
                continue

            species_map = data_map_container.get((prot_id, species))
            ensembl_record = protein_container.get(species_map.protein_id).get_sequence_record()
            ensembl_record.id = species
            protein_recs = [human_record, ensembl_record]

            # Add the assembled protein for this species when one is on disk.
            if "%s.fa" % species in os.listdir(assembled_dir):
                protein_recs.append(
                    load_fasta_single_record("%s/%s.fa" % (assembled_dir, species), IUPAC.protein))

            msa_species_path = "%s/%s.fa" % (crawler.get_mafft_path(prot_id), species)

            # NOTE(review): at this point the list always holds at least two
            # records, so this guard looks unreachable - kept as in the original.
            if len(protein_recs) == 1:
                continue
            write_seq_records_to_file(msa_species_path, protein_recs)

            aligned_path = "%s/%s.afa" % (crawler.get_mafft_path(prot_id), species)
            cmd = command_generator.generate_mafft_command(msa_species_path, aligned_path)
            print(cmd)
            os.system(cmd)

            # The unaligned input is only needed while the command runs.
            os.remove(msa_species_path)
Ejemplo n.º 4
0
def main ():
    """
    Fill all containers (passing False) and generate the SW exon alignments
    for a single hard-coded protein id.
    """
    target_protein_id = "ENSP00000341765"

    fill_all_containers(False)
    generate_SW_exon_alignments2(target_protein_id)
        # NOTE(review): this is the tail of a method whose ``def`` line (and
        # the initialisation of ``whole_protein_cdna``) is not visible in
        # this excerpt.
        # Walk the coding exons of the reference and append one cDNA piece
        # per exon to ``whole_protein_cdna``.
        for ref_exon in self.ref_exons.get_coding_exons():
            bea = self.best_exons[ref_exon.exon_id]
            if not bea:
                # No best alignment for this exon: pad with one "N" per
                # base of the reference exon sequence.
                whole_protein_cdna += "N" * len(ref_exon.sequence)
            elif bea.status in ["both", "ensembl"]:
                # The ensembl alignment is given the length of the cDNA
                # assembled so far.
                whole_protein_cdna += bea.ensembl_alignment.get_cDNA(len(whole_protein_cdna))
            else:
                # Any other status: use the cDNA built from the SW gene alignment.
                whole_protein_cdna += bea.sw_gene_alignment.create_cDNA()     
                
        # The result is the translation of the concatenated cDNA.
        return whole_protein_cdna.translate()
                
                
        
            


if __name__ == '__main__':
    # Script driver: build the best protein product for one hard-coded
    # protein / species / reference-species triple and print its translation.
    fill_all_containers(True)

    bpp = BestProteinProduct("ENSP00000341765", "Ailuropoda_melanoleuca", "Homo_sapiens")
    bpp.decide_on_best_exon_alignments()
    bpp.patch_interexon_AAS()
    print(bpp.export_spec_protein_translation())

    # The best-exon mapping is bound but not used further in this block.
    be = bpp.best_exons
    # Finish with an empty line on stdout.
    print("")
    
        
    
Ejemplo n.º 6
0
def main():
    """
    Fill all containers (passing True) and create statistics for every
    entry returned by get_protein_list().
    """
    fill_all_containers(True)

    # The ExonContainer instance is obtained but not used further here.
    exon_container = ExonContainer.Instance()

    # A translate_alignment_exons() step is currently disabled at this point.
    protein_tuples = get_protein_list()
    create_statistics(protein_tuples)
Ejemplo n.º 7
0
def main ():
    #ERROR FILE:::
    err_f = open('/home/marioot/err_status_monday.txt', 'w')

    fill_all_containers(True)
    
    protein_tuples = get_protein_list()
    ec = ExonContainer.Instance()
    beac = BestExonAlignmentContainer.Instance()
    dc = DirectoryCrawler()
    
    for (protein_id, exon_num) in protein_tuples:
        
        if int(exon_num) > 15:
            print "too big"
            continue
        
        species_list = get_species_list(protein_id, None)
        try:
            ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
        except KeyError:
            print "ERROR: No protein %s" % protein_id
            continue
        
        for species in species_list:
            try:
                print "\nBest_exon_al: %s, %s" % (protein_id, species)
                err_f.write("%s, %s" % (protein_id, species))
                
                bpp = BestProteinProduct (protein_id, species, "Homo_sapiens")
                bpp.load_alignments()
                bpp.decide_on_best_exons()
                #bpp.patch_interexon_AAS()
                
                for ref_exon in ref_exons.get_coding_exons():
                    
                    best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
                    if best_exon_alignment:
                        beac.add(ref_exon.exon_id, species, best_exon_alignment)
                        print "%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id)
                        if best_exon_alignment.sw_gene_alignment:
                            print ref_exon.sequence[ref_exon.frame:].translate()
                            best_exon_alignment.sw_gene_alignment.create_cDNA()
                            print "\tAdded  %2d alignment pieces" % (len(best_exon_alignment.sw_gene_alignment.alignment_pieces))
                            for al_piece in best_exon_alignment.sw_gene_alignment.alignment_pieces:
                                print "\t\t%s:" % (al_piece.type),
                                if al_piece.type in ["coding", "insertion"]:
                                    print "PROT: %d-%d, GENOME: %d-%d, %s" % (al_piece.ref_protein_start,
                                                                            al_piece.ref_protein_stop,
                                                                            al_piece.genomic_start, 
                                                                            al_piece.genomic_stop, 
                                                                            al_piece.sequence_id)
                                    print "\t\t\tHUMAN:", al_piece.ref_protein_seq
                                    print "\t\t\tSPEC :", al_piece.spec_protein_seq
                                else:
                                    print
                                    
                whole_prot =  bpp.get_spec_protein_translation()
                whole_prot_rec = SeqRecord(whole_prot, id = species, description = "assembled_protein")
                file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
                SeqIO.write(whole_prot_rec, file_name, "fasta")
                
                print beac.get("ENSE00002199725", species)
            except Exception, e:
                print '{0} {1} \n'.format(protein_id, species)
                err_f.write('{0} {1} \n'.format(protein_id, species))