def test_read_file(self):
        """read_file should transparently open plain and gzipped text files."""
        text = 'Hello World!'

        # plain-text file: contents come back unchanged
        path = join(self.tmpdir, 'test.txt')
        with open(path, 'w') as fh:
            fh.write(text)
        with read_file(path) as fh:
            self.assertEqual(fh.read(), text)

        # gzip-compressed file: decompressed on the fly by extension
        path = join(self.tmpdir, 'test.txt.gz')
        with gzip.open(path, 'wb') as fh:
            fh.write(text.encode())
        with read_file(path) as fh:
            self.assertEqual(fh.read(), text)
        remove(path)
Example #2
    def read_search_results(file, maxhits=None):
        """Read homology search results of one sample.

        Parameters
        ----------
        file : str
            input filepath
        maxhits : int
            maximum number of hits per protein to preserve

        Returns
        -------
        list of dict
            search results

        Notes
        -----
        Header lines have the form ``# Key: value``; any other line is a
        tab-delimited hit row belonging to the most recent protein.
        """
        p = re.compile(r'# (\S+): (.*)')
        data = []
        with read_file(file) as f:
            for line in f:
                line = line.rstrip('\r\n')
                m = p.match(line)
                if m:
                    if m.group(1) == 'ID':
                        # start a new protein record
                        data.append({'id': m.group(2), 'hits': []})
                    elif m.group(1) == 'Length':
                        data[-1]['length'] = int(m.group(2))
                    elif m.group(1) == 'Product':
                        data[-1]['product'] = m.group(2)
                    elif m.group(1) == 'Score':
                        data[-1]['score'] = float(m.group(2))
                else:
                    # Cap hits per protein but keep parsing: the previous
                    # `break` here aborted the whole file once the FIRST
                    # protein filled up, silently dropping all later
                    # proteins, contrary to the documented per-protein cap.
                    if maxhits and len(data[-1]['hits']) >= maxhits:
                        continue
                    data[-1]['hits'].append(line)

        # convert each protein's hit rows into a DataFrame indexed by
        # subject id; '*' denotes a missing value in the search output
        for i in range(len(data)):
            data[i]['hits'] = pd.read_csv(StringIO('\n'.join(data[i]['hits'])),
                                          sep='\t',
                                          na_values='*',
                                          names=[
                                              'id', 'identity', 'evalue',
                                              'score', 'coverage', 'taxid'
                                          ],
                                          dtype={
                                              'id': str,
                                              'identity': np.float32,
                                              'evalue': np.float64,
                                              'score': np.float32,
                                              'coverage': np.float32,
                                              'taxid': str
                                          }).set_index('id')
        return data
Example #3
    def read_search_results(file,
                            maxhits=None,
                            evalue=None,
                            identity=None,
                            coverage=None):
        """Read homology search results of one sample.

        Parameters
        ----------
        file : str
            input filepath
        maxhits : int
            maximum number of hits per protein to preserve
        evalue : float
            maximum E-value cutoff
        identity : int
            minimum percent identity cutoff
        coverage : int
            minimum percent query coverage cutoff

        Returns
        -------
        list of dict
            search results
        """
        # header lines look like "# Key: value"
        header = re.compile(r'# (\S+): (.*)')
        data = []
        with read_file(file) as f:
            for raw in f:
                raw = raw.rstrip('\r\n')
                match = header.match(raw)
                if match is None:
                    # non-header lines are hit rows of the current protein
                    data[-1]['hits'].append(raw)
                    continue
                key, value = match.group(1), match.group(2)
                if key == 'ID':
                    data.append({'id': value, 'hits': []})
                elif key == 'Length':
                    data[-1]['length'] = int(value)
                elif key == 'Product':
                    data[-1]['product'] = value
                elif key == 'Score':
                    data[-1]['score'] = float(value)

        # column layout of a hit row; '*' marks a missing value
        columns = ['id', 'identity', 'evalue', 'score', 'coverage', 'taxid']
        dtypes = {'id': str,
                  'identity': np.float32,
                  'evalue': np.float64,
                  'score': np.float32,
                  'coverage': np.float32,
                  'taxid': str}

        for entry in data:
            # parse this protein's hit rows into a DataFrame keyed by
            # subject id
            hits = pd.read_csv(StringIO('\n'.join(entry['hits'])),
                               sep='\t',
                               na_values='*',
                               names=columns,
                               dtype=dtypes).set_index('id')

            # apply threshold filters; rows with missing values compare
            # False and drop out, matching the query-string behavior
            keep = ((hits['evalue'] <= (evalue or 100)) &
                    (hits['identity'] >= (identity or 0)) &
                    (hits['coverage'] >= (coverage or 0)))
            hits = hits[keep]

            # truncate to the top maxhits rows if a cap was given
            entry['hits'] = hits.head(maxhits) if maxhits else hits
        return data