def __init__(self, external_name):
    if not os.path.isdir(PathResolver.datasets_output_path()):
        os.makedirs(PathResolver.datasets_output_path())

    self.external_name = external_name
    self.internal_name = self.detect_internal_name()
    NameConverter.register(self.external_name)
    self._contigs_count = None
def _make_datasets(self):
    NameConverter.load()
    input_path = PathResolver.input_path()
    files = [n.split('/')[-1] for n in glob.glob(input_path + '/*')]
    # groupby only groups adjacent items, so the list must be sorted
    # before grouping by external name.
    files.sort()

    groups = {}
    for gid, els in itertools.groupby(files, self._extract_external_name):
        groups[gid] = list(els)

    for group, group_files in groups.iteritems():
        # A complete dataset needs left reads, right reads, and contigs.
        if not group or len(group_files) < 3:
            continue
        file_group = {'reads_1': None, 'reads_2': None, 'contigs': None}
        for f in group_files:
            if re.match(DataManager.LEFT_READS_FNAME_REGEXP, f):
                file_group['reads_1'] = f
            elif re.match(DataManager.RIGHT_READS_FNAME_REGEXP, f):
                file_group['reads_2'] = f
            elif re.match(DataManager.CONTIGS_FNAME_REGEXP, f):
                file_group['contigs'] = f
        if None not in file_group.values():
            ext_name = self._extract_external_name(file_group['contigs'])
            dataset = Dataset(ext_name)
            self.datasets.append(dataset)

    return self.datasets
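# Example of the input layout _make_datasets expects (file names here are
# hypothetical; the real patterns are the *_FNAME_REGEXP constants on
# DataManager):
#
#   input/
#     sample_A_1.fastq   -> reads_1
#     sample_A_2.fastq   -> reads_2
#     sample_A.fasta     -> contigs
#
# _extract_external_name would map all three files to 'sample_A', so the
# groupby pass collects them into a single dataset.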
def _find_dataset_file(self, file_name, extensions):
    path = PathResolver.input_path()
    paths = [glob.glob('%s/%s.%s' % (path, file_name, ext))
             for ext in extensions]
    # Flatten the per-extension match lists into a single list.
    paths = sum(paths, [])
    return paths[0] if paths else None
def perform(self):
    PathResolver.assure_path_exists(self._db_folder_path())

    all_vs_all_path = PathResolver.output_path_for(PathResolver.ALL_VS_ALL_FOLDER)
    PathResolver.assure_path_exists(all_vs_all_path)

    one_vs_all_path = PathResolver.output_path_for(PathResolver.ONE_VS_ALL_FOLDER)
    PathResolver.assure_path_exists(one_vs_all_path)

    self._make_databases()
    self._perform_blasts()
    self._make_one_vs_all_files()
def __init__(self, dataset):
    self.dataset = dataset
    folder_path = PathResolver.output_path_for(self.MAPPINGS_FOLDER)
    PathResolver.assure_path_exists(folder_path)
    db_path = PathResolver.output_path_for(self.MAPPINGS_FOLDER, self.DB_FOLDER)
    PathResolver.assure_path_exists(db_path)
def _make_one_vs_all_files(self):
    all_vs_all_path = PathResolver.output_path_for(PathResolver.ALL_VS_ALL_FOLDER)

    # Group the pairwise result files by the organism on the left of '_VS_'.
    files = {}
    for f_path in glob.glob(all_vs_all_path + '/*.%s' % self.BLAST_RESULT_EXT):
        left_org_id = os.path.splitext(os.path.basename(f_path))[0].split('_VS_')[0]
        if left_org_id not in files:
            files[left_org_id] = []
        files[left_org_id].append(f_path)

    # Use a distinct loop variable: the original rebound 'files' while
    # iterating over it, shadowing the dict being traversed.
    for org_id, org_files in files.iteritems():
        org_file_name = org_id + '.' + self.BLAST_RESULT_EXT
        path = PathResolver.output_path_for(PathResolver.ONE_VS_ALL_FOLDER,
                                            org_file_name)
        if os.path.exists(path):
            os.remove(path)
        for f_path in org_files:
            subprocess.call('cat %s >> %s' % (f_path, path), shell=True)
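# For example (hypothetical organism ids), the all-vs-all results
#
#   orgA_VS_orgB.<BLAST_RESULT_EXT>, orgA_VS_orgC.<BLAST_RESULT_EXT>
#
# are concatenated, in glob order, into the single one-vs-all file
# orgA.<BLAST_RESULT_EXT>.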
def _perform_blast(self, left_path, right_path):
    command = 'blastn -query %s -db %s -out %s -outfmt "6 %s" -num_threads %s'

    right_org_name = os.path.splitext(os.path.basename(right_path))[0]
    left_org_name = os.path.splitext(os.path.basename(left_path))[0]

    db_path = self._db_folder_path(right_org_name)
    outfile_name = '%s_VS_%s.%s' % (left_org_name, right_org_name,
                                    self.BLAST_RESULT_EXT)
    output_path = PathResolver.output_path_for(PathResolver.ALL_VS_ALL_FOLDER,
                                               outfile_name)
    threads_cnt = Settings.winston.tools.blast.threads

    command = command % (left_path, db_path, output_path, self.COLUMNS,
                         threads_cnt)
    subprocess.call(command, shell=True)
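# The rendered command, with hypothetical file names and assuming COLUMNS
# is a space-separated tabular (outfmt 6) column list such as
# 'qseqid sseqid pident', would look like:
#
#   blastn -query orgA.fasta -db <blast_db>/orgB -out orgA_VS_orgB.blast \
#          -outfmt "6 qseqid sseqid pident" -num_threads 4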
def main(args):
    logging.basicConfig(level=logging.getLevelName(args.logging))
    logging.info('User args: %s' % pformat(args))

    config = load_config_from_json_file(
        args.config,
        ['gap', 'same', 'diff', 'max_number_of_paths', 'max_sequence_length'])
    logging.info('Config is: \n%s' % pformat(config))

    seq1 = load_fasta_file(args.input1)
    seq2 = load_fasta_file(args.input2)
    if config['max_sequence_length'] != 0 and \
            max(len(seq1), len(seq2)) > config['max_sequence_length']:
        raise ValueError('Sequence exceeded max_sequence_length')

    score_matrix, nodes_mapping = solve(seq1, seq2, config['gap'],
                                        config['diff'], config['same'])
    logging.debug('Score matrix: \n%s' % pformat(score_matrix))
    logging.debug('Nodes mapping: (target_node): [(parent_node),...]\n%s'
                  % pformat(nodes_mapping))
    logging.info('Alignments score: %s' % score_matrix[len(seq1), len(seq2)])

    paths = PathResolver(nodes_mapping).resolve_paths(
        len(seq1), len(seq2), config['max_number_of_paths'])
    allignments = [get_allignments(path, seq1, seq2) for path in paths]
    for i, (allignment_1, allignment_2) in enumerate(allignments):
        logging.info('[A%04d] %s' % (i, allignment_1))
        logging.info('[A%04d] %s' % (i, allignment_2))

    if args.output:
        save_output(args.output, {
            'seq1': seq1,
            'seq2': seq2,
            'config': config,
            'allignments': allignments,
            'score_matrix': score_matrix.tolist()
        })
        logging.info('Saved output to %s' % args.output)
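# A minimal sketch of the scoring recurrence 'solve' appears to implement,
# assuming a Needleman-Wunsch-style global alignment (an assumption; only
# the gap/same/diff scores are given by the config):
#
#   score[i][j] = max(
#       score[i - 1][j] + gap,       # gap in seq2
#       score[i][j - 1] + gap,       # gap in seq1
#       score[i - 1][j - 1] + (same if seq1[i - 1] == seq2[j - 1] else diff),
#   )
#
# nodes_mapping records, for each cell (i, j), every parent cell that
# attains the maximum; resolve_paths walks those parents back from
# (len(seq1), len(seq2)) to enumerate the co-optimal alignments.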
def load(file_path=None):
    if not file_path:
        file_path = PathResolver.pair_types_path()
    # Already loaded: the parsed table is cached on the class.
    if TypesManager.types_dict:
        return True

    with open(file_path, 'r') as f:
        result = {}
        for row in csv.reader(f):
            key = TypesManager.detect_key(row[0], row[1])
            result[key] = {'threshold': float(row[2]), 'type': row[3]}
        TypesManager.types_dict = result

    return True
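# Each row of the pair-types CSV is expected to carry four columns: two
# names, a float threshold, and a type label. With hypothetical values:
#
#   orgA,orgB,0.95,paralog
#
# row[0] and row[1] feed detect_key, row[2] becomes the threshold, and
# row[3] the type.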
def _get_output_path(self, input_path):
    file_name = os.path.basename(input_path)
    # The original referenced an undefined 'input_file_name'; the intent,
    # given detect_internal_name in __init__, is to swap the external
    # dataset name in the file name for the internal one.
    file_name = file_name.replace(self.external_name, self.internal_name)
    return PathResolver.datasets_output_path(file_name)
def _default_db_path():
    return PathResolver.output_path_for(DatabaseWorker.DEFAULT_DB_FILE_NAME)
def pileup_output_path(self):
    file_name = '%s_pileup.txt' % self.dataset.external_name
    return PathResolver.output_path_for(self.MAPPINGS_FOLDER, file_name)
def reads_output_paths(self):
    input_names = [os.path.basename(p) for p in self.reads_input_paths()]
    return [PathResolver.datasets_output_path(p) for p in input_names]
def pair_types_path():
    return PathResolver.output_path_for(PathResolver.TYPES_FILENAME)
def _db_folder_path(self, *inner_path):
    return PathResolver.output_path_for(PathResolver.BLAST_DB_FOLDER, *inner_path)
def _contigs_files(self):
    files = glob.glob(PathResolver.datasets_output_path() + '/*')
    # Match against the base name, as _make_datasets does: re.match anchors
    # at the start of the string, so a file-name pattern would never match
    # a full path with a leading directory.
    return [e for e in files
            if re.match(DataManager.CONTIGS_FNAME_REGEXP, os.path.basename(e))]
def contigs_output_path(self):
    file_name = os.path.basename(self.contigs_input_path())
    return PathResolver.datasets_output_path(file_name)
def _dict_file_path():
    return PathResolver.output_path_for(NameConverter.DICT_FILENAME)
def test_path_resolving_from_top_left(self):
    nodes_mapping = PathResolvingTest.get_nodes_mapping()
    path_resolver = PathResolver(nodes_mapping)
    paths = path_resolver.resolve_paths(0, 0)
    self.assertEqual(len(paths), 1)
def test_path_resolving_from_bottom_right(self):
    nodes_mapping = PathResolvingTest.get_nodes_mapping()
    path_resolver = PathResolver(nodes_mapping)
    paths = path_resolver.resolve_paths(4, 5)
    self.assertEqual(len(paths), 2)
def sam_file_path(self):
    file_name = '%s.sam' % self.dataset.external_name
    paths = [self.MAPPINGS_FOLDER, file_name]
    return PathResolver.output_path_for(*paths)
def test_path_resolving_from_bottom_right_with_restrictions(self):
    max_number_of_paths = 1
    nodes_mapping = PathResolvingTest.get_nodes_mapping()
    path_resolver = PathResolver(nodes_mapping)
    paths = path_resolver.resolve_paths(4, 5, max_number_of_paths)
    self.assertEqual(len(paths), max_number_of_paths)
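# The fixture get_nodes_mapping() is assumed (its body is not shown here)
# to return the structure the main script logs as
# '(target_node): [(parent_node),...]': a dict from a cell to the parent
# cells it can be reached from, e.g. the hypothetical
#
#   {(1, 1): [(0, 0), (0, 1)], ...}
#
# resolve_paths(4, 5) then walks back from (4, 5) toward (0, 0), yielding
# one path per distinct chain of parents, capped by max_number_of_paths.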
def db_path(self):
    return PathResolver.output_path_for(self.MAPPINGS_FOLDER, self.DB_FOLDER)
def _open_logs(self):
    for name, ext in self.FILES.iteritems():
        f_name = '%s_%s.%s' % (self.external_name, name, ext)
        self.logs[name] = open(PathResolver.results_path_for(f_name), 'w')
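# FILES is assumed to map a log name to its file extension, e.g. the
# hypothetical {'stats': 'txt', 'errors': 'log'}, which would open
# '<external_name>_stats.txt' and '<external_name>_errors.log' under the
# results folder, keyed by log name in self.logs.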