class AlignmentTargetGenerator(object):
    '''
    Looks up, per alignment type, which species still have to be aligned
    for a protein, and records the species whose alignment failed.
    Each alignment type (blastn, tblastn, SW_gene, SW_exon) is handled
    through its own directory, resolved by the DirectoryCrawler.
    '''

    def __init__(self):
        self.crawler = DirectoryCrawler()
        self.description_parser = DescriptionParser()

    # ----------------------------------------------------------- blastn

    def get_blastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose blastn directory is inspected
        @return: the list of species (RBS) not aligned with blastn
                 for that protein
        '''
        return get_species_list(protein_id,
                                self.crawler.get_blastn_path(protein_id))

    def set_failed_blastn_targets(self, protein_id, failed_species_list):
        '''
        Writes the species for which blastn failed to the status
        of the protein's blastn directory.
        @param protein_id
        @param failed_species_list: species to be recorded as failed
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_blastn_path(protein_id))

    # ---------------------------------------------------------- tblastn

    def get_tblastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose tblastn directory is inspected
        @return: the list of species not aligned with tblastn
                 for that protein
        '''
        return get_species_list(protein_id,
                                self.crawler.get_tblastn_path(protein_id))

    def set_failed_tblastn_targets(self, protein_id, failed_species_list):
        '''
        Writes the species for which tblastn failed to the status
        of the protein's tblastn directory.
        @param protein_id
        @param failed_species_list: species to be recorded as failed
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_tblastn_path(protein_id))

    # ---------------------------------------------------------- SW gene

    def get_SW_gene_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW_gene directory is inspected
        @return: the list of species not aligned with SW_gene
                 for that protein
        '''
        return get_species_list(protein_id,
                                self.crawler.get_SW_gene_path(protein_id))

    def set_failed_SW_gene_targets(self, protein_id, failed_species_list):
        '''
        Writes the species for which SW_gene failed to the status
        of the protein's SW_gene directory.
        @param protein_id
        @param failed_species_list: species to be recorded as failed
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_SW_gene_path(protein_id))

    # ---------------------------------------------------------- SW exon

    def get_SW_exon_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW_exon directory is inspected
        @return: the list of species not aligned with SW_exon
                 for that protein
        '''
        return get_species_list(protein_id,
                                self.crawler.get_SW_exon_path(protein_id))

    def set_failed_SW_exon_targets(self, protein_id, failed_species_list):
        '''
        Writes the species for which SW_exon failed to the status
        of the protein's SW_exon directory.
        @param protein_id
        @param failed_species_list: species to be recorded as failed
        '''
        write_failed_species_to_status(failed_species_list,
                                       self.crawler.get_SW_exon_path(protein_id))
def reset_action(protein_id, key):
    '''
    Marks the given action as FAILED in the status file of the protein
    and clears the directories associated with that action.
    @param protein_id
    @param key: status file key of the action; keys without associated
                directories only get their status entry updated
    '''
    update_entry_in_status_file(protein_id, key, 'FAILED')
    dc = DirectoryCrawler()

    # action key -> names of the DirectoryCrawler getters whose
    # directories are cleared, in the order they are cleared
    getters_by_action = {
        'GENE_RETRIEVAL':          ('get_gene_path',),
        'EXP_GENE_RETRIEVAL':      ('get_expanded_gene_path',),
        'PROTEIN_RETRIEVAL':       ('get_protein_path',),
        'ENSEMBL_EXON_RETRIEVAL':  ('get_exon_ensembl_path',),
        'GENEWISE_EXON_RETRIEVAL': ('get_exon_genewise_path',
                                    'get_genewise_path'),
        'REF_SP_DB_FORMATTING':    ('get_database_path',),
        'BLASTN_ALIGNMENT':        ('get_blastn_path',),
        'TBLASTN_ALIGNMENT':       ('get_tblastn_path',),
        'SW_GENE_ALIGNMENT':       ('get_SW_gene_path',),
        'SW_EXON_ALIGNMENT':       ('get_SW_exon_path',),
    }

    # getters are resolved lazily, so only the ones belonging
    # to the requested action are ever looked up and called
    for getter_name in getters_by_action.get(key, ()):
        directory = getattr(dc, getter_name)(protein_id)
        clear_directory(directory)
def _run_blastn_command(command):
    '''
    Runs a command through the shell and collects everything it prints.
    @param command: command line string to execute
    @return: combined stdout and stderr of the command, as text
    '''
    process = Popen(command, shell=True, stdin=PIPE, stdout=PIPE,
                    stderr=STDOUT, close_fds=True)
    # communicate() closes stdin, drains the output pipe and waits for the
    # child, so the process is reaped instead of being left behind
    output, _ = process.communicate()
    if not isinstance(output, str):
        # the pipe yields bytes where str is a unicode type; decode so that
        # the emptiness check and the log message work on text
        output = output.decode("utf-8", "replace")
    return output


def generate_blastn_alignments(protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
    Runs the blastn program for a specified protein and list of species.
    Anything printed by the blastn command is treated as a failure: the
    output file of that species is removed, a warning is logged and the
    species is recorded as failed.
    @param protein_id
    @param species_list: if provided, runs blastn for this list of species,
                         otherwise runs for species that are missing the blastn output,
                         who are determined by .status file in the blastn folder.
    @param referenced_species: name of the species whose .fa file in the
                               database directory is used as blastn database
    @return: False if blastn failed for at least one species, True otherwise
    '''
    logger = Logger.Instance()
    alignment_logger = logger.get_logger('alignment')

    crawler = DirectoryCrawler()
    command_generator = CommandGenerator()
    alignment_generator = AlignmentTargetGenerator()

    # retrieve the blastn targets
    if not species_list:
        species_list = alignment_generator.get_blastn_targets(protein_id)

    failed_species_list = []
    for species in species_list:
        species_name = species.strip()

        output_file = "{0}/{1}.blastout".format(crawler.get_blastn_path(protein_id), species_name)
        input_file = "{0}/{1}.fa".format(crawler.get_expanded_gene_path(protein_id), species_name)
        database = "{0}/{1}.fa".format(crawler.get_database_path(protein_id), referenced_species)

        command = command_generator.generate_blastn_command(database, input_file, output_file)
        output = _run_blastn_command(command)

        if output:
            # the command may have failed before creating the output file,
            # so only remove it when it is actually there
            if os.path.isfile(output_file):
                os.remove(output_file)
            alignment_logger.warning("{0}, {1}, BLASTN, {2}".format(protein_id, species_name, output.strip()))
            failed_species_list.append(species_name)

    if failed_species_list:
        alignment_generator.set_failed_blastn_targets(protein_id, failed_species_list)
        return False
    return True
def parse_blast_output (ref_protein_id, species, blast):
    '''
    Parses the blast XML output (.blastout file) of a species and groups
    the alignments by the reference exon they were found for.
    @param ref_protein_id: protein whose blast directory holds the output
    @param species: species name, used as the base name of the .blastout file
    @param blast: "blastn" reads from the blastn directory, any other
                  value reads from the tblastn directory
    @return: Dictionary where key is reference species exon_id, and the value is
             list of corresponding alignments (Exon objects). None if the
             blastout file does not exist or NCBIXML.read rejects it.
    '''
    logger = Logger.Instance()
    containers_logger = logger.get_logger('containers')

    dc = DirectoryCrawler()
    if blast == "blastn":
        blast_file = "{0}/{1}.blastout".format(dc.get_blastn_path(ref_protein_id), species)
    else:
        blast_file = "{0}/{1}.blastout".format(dc.get_tblastn_path(ref_protein_id), species)

    if not os.path.isfile(blast_file):
        containers_logger.error ("{0}, {1}, {2}, no blastout file".format(ref_protein_id, species, blast))
        return None

    # parse blast output; the with-block closes the file on every path,
    # including the early return below
    with open(blast_file, 'r') as file_handle:
        try:
            blast_record = NCBIXML.read(file_handle)
        except ValueError:
            containers_logger.error("%s,%s,%s,No hits found" % (ref_protein_id, species, blast))
            return None

    # second token of the alignment title: start|end|ENS...|ENS...|strand
    exon_pattern = re.compile(r"(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")
    max_hsps = 5

    exon_dict = {}
    for alignment in blast_record.alignments:

        # NOTE(review): a title that does not consist of exactly two tokens,
        # or whose second token does not match the pattern, raises here
        (blast_info, exon_info) = alignment.title.split()
        pattern_groups = re.match(exon_pattern, exon_info).groups()
        exon_start = int(pattern_groups[0])
        exon_end = int(pattern_groups[1])
        ref_exon_id = pattern_groups[3]

        # limit alignments to max_hsps hsps
        num_of_hsps = 0
        for hsp in alignment.hsps:

            if blast == "blastn":
                # skip blastn hsps where either frame is -1
                (query_frame, hit_frame) = hsp.frame
                if query_frame == -1 or hit_frame == -1:
                    continue
            if num_of_hsps == max_hsps:
                break
            num_of_hsps += 1

            exon = Exon(blast, ref_exon_id, ref_protein_id, species)

            # hsp.gaps is either a number or a tuple; always end up with a
            # number so that a value from a previous hsp is never reused
            gaps = hsp.gaps
            if isinstance(gaps, tuple):
                gaps = gaps[0] if gaps else None
            if not gaps:
                gaps = 0

            aligned_length = len(hsp.sbjct)
            exon.set_alignment_info (hsp.identities,
                                     hsp.positives,
                                     gaps,
                                     hsp.sbjct_start,
                                     hsp.sbjct_start + aligned_length - 1,
                                     hsp.query_start,
                                     hsp.query_start + aligned_length - 1,
                                     aligned_length,
                                     hsp.sbjct,
                                     hsp.query,
                                     hsp.score)

            exon_dict.setdefault(ref_exon_id, []).append(exon)

            # means we covered the whole exon
            if aligned_length == abs(exon_end - exon_start) + 1 and aligned_length == hsp.identities:
                break

    return exon_dict