Esempio n. 1
0
 def test_parse_cdhit_clstr_file(self):
     """parse_cdhit_clstr_file groups the sequence ids by cluster"""
     # One inner list per expected cluster, in the order they should be
     # returned.
     expected = [
         ['seq0'],
         ['seq1', 'seq10', 'seq3', 'seq23', 'seq145'],
         ['seq7', 'seq17', 'seq69', 'seq1231'],
     ]
     # The parser is handed the fixture as a list of lines.
     lines = cdhit_clstr_file.split('\n')
     observed = parse_cdhit_clstr_file(lines)
     self.assertEqual(observed, expected)
Esempio n. 2
0
 def test_parse_cdhit_clstr_file(self):
     """parse_cdhit_clstr_file groups the sequence ids by cluster"""
     # One inner list per expected cluster, in the order they should be
     # returned.
     expected = [
         ['seq0'],
         ['seq1', 'seq10', 'seq3', 'seq23', 'seq145'],
         ['seq7', 'seq17', 'seq69', 'seq1231'],
     ]
     # The parser is handed the fixture as a list of lines.
     lines = cdhit_clstr_file.split('\n')
     observed = parse_cdhit_clstr_file(lines)
     self.assertEqual(observed, expected)
Esempio n. 3
0
def parse_cdhit_clusters(cfname,
                         output=None,
                         cull_empty=False,
                         plate_labels=True,
                         **kwargs):
    """Parse a CD-HIT cluster file into query and library-member lookups.

    The first member of every cluster is treated as the library member and
    the remaining members as the queries that clustered with it.

    NOTE: this assumes the sequence from the second database is always
    listed first in a cluster; CD-HIT does not do this canonically.

    Args:
        cfname: Path of the cluster file, or an already-open file-like
            object (anything with a ``read`` attribute). An object passed in
            by the caller is iterated line by line and is not closed here.
        output: Not used by this function.
        cull_empty: If true, drop clusters that have only one member.
        plate_labels: If true, every query id is passed through
            ``sample.parse_sanger_id`` and the returned value is stored in
            place of the raw id (presumably a plate/well position -- confirm
            against that helper).
        **kwargs: Accepted and ignored.

    Returns:
        A ``(queries, lib_mems)`` tuple. ``queries`` maps each query to the
        library member of its cluster; ``lib_mems`` maps each library member
        to the list of queries in its cluster.
    """
    sys.stderr.write('Parsing Cluster file...\n')

    # Duck-type the handle check: the ``file`` builtin only exists on
    # Python 2, and this also lets callers pass StringIO-like objects.
    if hasattr(cfname, 'read'):
        clines = [line.rstrip() for line in cfname]
    else:
        # We opened the file, so we are responsible for closing it.
        with open(cfname) as cfhandle:
            clines = [line.rstrip() for line in cfhandle]

    clstrs = cd_hit.parse_cdhit_clstr_file(clines)

    # cull clusters with one member
    if cull_empty:
        clstrs = [clstr for clstr in clstrs if len(clstr) > 1]

    #print 2 formats: Seq\tWell Name\tLibrary Member
    #                 Library Member\tWell1\tWell2\tetc...

    #if well member follows regex (\d+-*\d*)-(\d+), it is a well
    queries = {}
    lib_mems = {}

    for clstr in clstrs:
        # Slice instead of pop(0) so the parsed cluster is left untouched.
        lib_mem = clstr[0]
        members = list(clstr[1:])
        if plate_labels:
            members = [sample.parse_sanger_id(member) for member in members]

        lib_mems[lib_mem] = members
        for member in members:
            queries[member] = lib_mem

    return queries, lib_mems
Esempio n. 4
0
    def detect_high_homology_pairs(self, seq_list):
        """Given a list of sequences, performs an all-to-all alignment
        and returns groups of sequences (as indices in the original list)
        which have a higher than desired homology.

        The sequences are written to a FASTA file under TMP_DATA_DIR, the
        ``cd-hit`` command is run on it with ``self.threshold`` (``-c``) and
        ``self.word_size`` (``-n``), and the resulting ``.clstr`` file is
        parsed. Unless ``self.debug`` is set, cd-hit's stdout is discarded.

        Args:
            seq_list: List of sequence strings. Order matters in that the
                indices returned are relative to the ordering of the list.

        Returns:
            A List<Set<int>> of indices corresponding to sequences that
            were found to have homology above the given threshold with each
            other. Only clusters with more than one member are returned.

        Raises:
            subprocess.CalledProcessError: If cd-hit exits with a non-zero
                status.
        """
        # Make sure the tmp directory exists.
        if not os.path.exists(TMP_DATA_DIR):
            os.mkdir(TMP_DATA_DIR)

        input_filename = os.path.join(TMP_DATA_DIR, 'test_cdhit_input.fasta')
        output_prefix = os.path.join(TMP_DATA_DIR, 'test_cdhit_output')

        # Write the list of sequences to a file in .fasta format, as expected
        # by the cd-hit command.
        with open(input_filename, 'w') as input_fh:
            for seq_index, seq in enumerate(seq_list):
                # For each sequence write the following two lines.
                # >0
                # ATGAGATAGTA
                input_fh.write('>' + str(seq_index) + '\n' + str(seq) + '\n')

        # Call cd-hit.
        cmd = [
            'cd-hit',
            '-i',
            input_filename,
            '-o',
            output_prefix,
            '-c',
            str(self.threshold),
            '-n',
            str(self.word_size),
        ]
        # check_call rather than call: the output file name is fixed, so a
        # failed run must not fall through and parse results left over from
        # a previous invocation.
        if self.debug:
            subprocess.check_call(cmd)
        else:
            # Close the devnull handle once cd-hit has finished.
            with open(os.devnull, 'wb') as devnull:
                subprocess.check_call(cmd, stdout=devnull)

        # Parse the results.
        cluster_output_filename = output_prefix + '.clstr'
        with open(cluster_output_filename) as cluster_output_fh:
            cluster_lines = [line.rstrip() for line in cluster_output_fh]

        cluster_list = cogent_cd_hit_parser.parse_cdhit_clstr_file(
            cluster_lines)

        # The homology conflicts are clusters with more than one element.
        # Cluster members are the FASTA headers written above, i.e. the
        # stringified list indices, so they convert back with int().
        return [
            set(int(seq_index_str) for seq_index_str in cluster)
            for cluster in cluster_list
            if len(cluster) > 1
        ]