% (len(sequences_full))) # Do an initial trimming to remove excessively gapped positions - this is critical for building a correct ATS print("Trimming alignment for highly gapped positions (80% or more).") alg_out, poskeep = sca.filterPos(sequences_full, [1], 0.8) sequences_ori = sequences_full sequences_full = alg_out print("Alignment size post-trimming: %i positions" % len(sequences_full[0])) # If i_ref is directly provided, we use it, ignoring all else. Otherwise, we explore the other ways of specifying a reference sequences: (1) providing a PDBid (chainID defaults to 'A'), (2) providing the protein sequence with position numbers (defaults to just sequence numbering). If none of these is provided, we just make an alignment based numbering for ats. If a PDBid is provided, there is an option to also provide species information to permit identifying the reference sequence in the MSA without use of external packages for fast pairwise alignments. if options.i_ref is None: if options.pdbid is not None: try: seq_pdb, ats_pdb, dist_pdb = sca.pdbSeq( options.pdbid, options.chainID) if options.species is not None: try: print( "Finding reference sequence using species-based best match.." ) i_ref = sca.MSAsearch(headers_full, sequences_full, seq_pdb, options.species) Options_ref = i_ref print("reference sequence index is: %i" % (i_ref)) print(headers_full[i_ref]) print(sequences_full[i_ref]) except: print( "Cant find the reference sequence using species-based best_match! Using global MSAsearch..." )
sequences_full = alg_out print_("Alignment size post-trimming: {:d} positions".format(len(sequences_full[0]))) # If i_ref is directly provided, we use it, ignoring all else. Otherwise, # we explore the other ways of specifying a reference sequences: (1) # providing a PDBid (chainID defaults to 'A'), (2) providing the protein # sequence with position numbers (defaults to just sequence numbering). # If none of these is provided, we just make an alignment based numbering # for ats. If a PDBid is provided, there is an option to also provide # species information to permit identifying the reference sequence in the # MSA without use of external packages for fast pairwise alignments. if options.i_ref is None: if options.pdbid is not None: try: seq_pdb, ats_pdb, dist_pdb = sca.pdbSeq(options.pdbid, options.chainID) if options.species is not None: try: print_("Finding reference sequence using species-based best match..") i_ref = sca.MSAsearch( headers_full, sequences_full, seq_pdb, options.species) Options_ref = i_ref print_("reference sequence index is: {:d}".format(i_ref)) print_(headers_full[i_ref]) print_(sequences_full[i_ref]) except: print_("Cant find the reference sequence using species-based best_match! Using global MSAsearch...") try: i_ref = sca.MSAsearch(headers_full, sequences_full, seq_pdb) options.i_ref = i_ref print_("reference sequence index is: {:d}".format(i_ref))