def test_parse_cdhit_clstr_file(self):
    """parse_cdhit_clstr_file should yield the expected list of clusters"""
    # Each inner list holds the sequence ids expected for one cluster.
    expected = [
        ['seq0'],
        ['seq1', 'seq10', 'seq3', 'seq23', 'seq145'],
        ['seq7', 'seq17', 'seq69', 'seq1231'],
    ]
    # The fixture is a single string; the parser is fed a list of lines.
    lines = cdhit_clstr_file.split('\n')
    observed = parse_cdhit_clstr_file(lines)
    self.assertEqual(observed, expected)
def parse_cdhit_clusters(cfname, output=None, cull_empty=False,
                         plate_labels=True, **kwargs):
    """Parse a CD-HIT cluster file into query and library-member lookups.

    The first entry of every cluster is treated as the library member and
    all remaining entries are treated as queries assigned to it.

    Args:
        cfname: Path to the cluster file, or an already-open file-like
            object (anything exposing ``read``). An object passed in by
            the caller is iterated but not closed here.
        output: Not used by this function.
        cull_empty: If True, clusters with a single member are dropped
            before the lookups are built.
        plate_labels: If True, every non-first member is converted with
            ``sample.parse_sanger_id`` and the converted value is stored;
            otherwise the raw member name is stored.
        **kwargs: Accepted and ignored.

    Returns:
        A tuple ``(queries, lib_mems)``: ``queries`` maps each non-first
        member (or its converted id) to the first member of its cluster,
        and ``lib_mems`` maps each first member to the list of the other
        members (or their converted ids) of that cluster.
    """
    sys.stderr.write('Parsing Cluster file...\n')

    if hasattr(cfname, 'read'):
        # Already-open handle: the caller owns it, so leave it open.
        clines = [line.rstrip() for line in cfname]
    else:
        # Opened here, so make sure it is closed again.
        with open(cfname) as cfhandle:
            clines = [line.rstrip() for line in cfhandle]

    clstrs = cd_hit.parse_cdhit_clstr_file(clines)

    # cull clusters with one member
    if cull_empty:
        clstrs = [clstr for clstr in clstrs if len(clstr) > 1]

    # 2 formats: Seq\tWell Name\tLibrary Member
    #            Library Member\tWell1\tWell2\tetc...
    # if well member follows regex (\d+-*\d*)-(\d+), it is a well
    queries = {}
    lib_mems = {}
    for clstr in clstrs:
        if not clstr:
            # Nothing to use as the library member; skip instead of
            # failing on the first-element lookup.
            continue
        # NOTE: this assumes that second DB will always come first,
        # unsure if CD-HIT does this canonically...(IT DOESNT)
        lib_mem = clstr[0]
        assigned = lib_mems[lib_mem] = []
        for member in clstr[1:]:
            key = sample.parse_sanger_id(member) if plate_labels else member
            queries[key] = lib_mem
            assigned.append(key)
    return queries, lib_mems
def detect_high_homology_pairs(self, seq_list):
    """Given a list of sequences, performs an all-to-all alignment and
    returns groups of sequences (as indices in the original list) which
    have a higher than desired homology.

    The sequences are written to a FASTA file under TMP_DATA_DIR, the
    external ``cd-hit`` command is run on that file, and the resulting
    ``.clstr`` file is parsed.

    Args:
        seq_list: List of sequence strings. Order matters in that the
            indices returned are relative to the ordering of the list.

    Returns:
        A List<Set<int>> of indices corresponding to sequences that
        cd-hit placed in the same cluster; only clusters with more than
        one element are returned.
    """
    # Make sure the tmp directory exists.
    if not os.path.exists(TMP_DATA_DIR):
        os.mkdir(TMP_DATA_DIR)

    input_filename = os.path.join(TMP_DATA_DIR, 'test_cdhit_input.fasta')
    output_prefix = os.path.join(TMP_DATA_DIR, 'test_cdhit_output')

    # Write the list of sequences to a file in .fasta format, as expected
    # by the cd-hit command.
    with open(input_filename, 'w') as input_fh:
        for seq_index, seq in enumerate(seq_list):
            # For each sequence write the following two lines.
            # >0
            # ATGAGATAGTA
            input_fh.write('>' + str(seq_index) + '\n' + str(seq) + '\n')

    # Call cd-hit.
    cmd = [
        'cd-hit',
        '-i', input_filename,
        '-o', output_prefix,
        '-c', str(self.threshold),
        '-n', str(self.word_size),
    ]

    # NOTE(review): the exit status of cd-hit is not checked. The output
    # path is fixed, so a '.clstr' file left behind by an earlier run
    # would be parsed if this run fails -- confirm whether that should
    # raise instead.
    if self.debug:
        # Let cd-hit's stdout go to the inherited stream.
        subprocess.call(cmd)
    else:
        # Discard cd-hit's stdout; the handle is closed once it exits.
        with open(os.devnull, 'wb') as devnull:
            subprocess.call(cmd, stdout=devnull)

    # Parse the results.
    cluster_output_filename = output_prefix + '.clstr'
    with open(cluster_output_filename) as cluster_output_fh:
        cluster_lines = [line.rstrip() for line in cluster_output_fh]
    cluster_list = cogent_cd_hit_parser.parse_cdhit_clstr_file(
        cluster_lines)

    # The homology conflicts are clusters with more than one element.
    # Cluster members are the FASTA headers written above, i.e. the
    # stringified list indices, so they convert back with int().
    return [
        set(int(seq_index_str) for seq_index_str in cluster)
        for cluster in cluster_list
        if len(cluster) > 1
    ]