def test_read_file(self):
    """Round-trip a string through plain and gzip files via read_file."""
    expected = 'Hello World!'

    # regular (uncompressed) file
    plain_fp = join(self.tmpdir, 'test.txt')
    with open(plain_fp, 'w') as fh:
        fh.write(expected)
    with read_file(plain_fp) as fh:
        self.assertEqual(fh.read(), expected)

    # gzip-compressed file
    gz_fp = join(self.tmpdir, 'test.txt.gz')
    with gzip.open(gz_fp, 'wb') as fh:
        fh.write(expected.encode())
    with read_file(gz_fp) as fh:
        self.assertEqual(fh.read(), expected)
    remove(gz_fp)
def read_search_results(file, maxhits=None):
    """Read homology search results of one sample.

    Parameters
    ----------
    file : str
        input filepath
    maxhits : int
        maximum number of hits per protein to preserve

    Returns
    -------
    list of dict
        search results
    """
    # header lines look like "# Key: value"; other lines are tab-separated hits
    p = re.compile(r'# (\S+): (.*)')
    data = []
    with read_file(file) as f:
        for line in f:
            line = line.rstrip('\r\n')
            m = p.match(line)
            if m:
                key = m.group(1)
                if key == 'ID':
                    data.append({'id': m.group(2), 'hits': []})
                elif key == 'Length':
                    data[-1]['length'] = int(m.group(2))
                elif key == 'Product':
                    data[-1]['product'] = m.group(2)
                elif key == 'Score':
                    data[-1]['score'] = float(m.group(2))
            else:
                # cap hits per protein; the previous `break` here aborted the
                # whole file after the first protein hit the cap, dropping all
                # subsequent proteins -- skip extra hits but keep reading
                if maxhits is None or len(data[-1]['hits']) < maxhits:
                    data[-1]['hits'].append(line)

    # convert each protein's hit table to a DataFrame
    cols = ['id', 'identity', 'evalue', 'score', 'coverage', 'taxid']
    dtypes = {'id': str, 'identity': np.float32, 'evalue': np.float64,
              'score': np.float32, 'coverage': np.float32, 'taxid': str}
    for i in range(len(data)):
        if data[i]['hits']:
            data[i]['hits'] = pd.read_csv(
                StringIO('\n'.join(data[i]['hits'])), sep='\t',
                na_values='*', names=cols, dtype=dtypes).set_index('id')
        else:
            # a protein with zero hits would make read_csv raise
            # EmptyDataError; use an empty frame with the same columns
            data[i]['hits'] = pd.DataFrame(columns=cols).set_index('id')
    return data
def read_search_results(file, maxhits=None, evalue=None, identity=None,
                        coverage=None):
    """Read homology search results of one sample.

    Parameters
    ----------
    file : str
        input filepath
    maxhits : int
        maximum number of hits per protein to preserve
    evalue : float
        maximum E-value cutoff
    identity : int
        minimum percent identity cutoff
    coverage : int
        minimum percent query coverage cutoff

    Returns
    -------
    list of dict
        search results
    """
    # read search result file; header lines look like "# Key: value"
    p = re.compile(r'# (\S+): (.*)')
    data = []
    with read_file(file) as f:
        for line in f:
            line = line.rstrip('\r\n')
            m = p.match(line)
            if m:
                key = m.group(1)
                if key == 'ID':
                    data.append({'id': m.group(2), 'hits': []})
                elif key == 'Length':
                    data[-1]['length'] = int(m.group(2))
                elif key == 'Product':
                    data[-1]['product'] = m.group(2)
                elif key == 'Score':
                    data[-1]['score'] = float(m.group(2))
            else:
                data[-1]['hits'].append(line)

    cols = ['id', 'identity', 'evalue', 'score', 'coverage', 'taxid']
    dtypes = {'id': str, 'identity': np.float32, 'evalue': np.float64,
              'score': np.float32, 'coverage': np.float32, 'taxid': str}

    # build filter only from cutoffs actually supplied; the previous
    # `(evalue or 100)` treated evalue=0.0 as unset, imposed an arbitrary
    # E-value ceiling of 100 by default, and the unconditional query
    # silently dropped rows with missing ('*' -> NaN) values even when no
    # filtering was requested
    conds = []
    if evalue is not None:
        conds.append('evalue <= {}'.format(evalue))
    if identity is not None:
        conds.append('identity >= {}'.format(identity))
    if coverage is not None:
        conds.append('coverage >= {}'.format(coverage))

    for i in range(len(data)):
        # convert hit table to DataFrame
        if data[i]['hits']:
            hits = pd.read_csv(
                StringIO('\n'.join(data[i]['hits'])), sep='\t',
                na_values='*', names=cols, dtype=dtypes).set_index('id')
        else:
            # a protein with zero hits would make read_csv raise
            # EmptyDataError; use an empty frame with the same columns
            hits = pd.DataFrame(columns=cols).set_index('id')

        # filter hits by thresholds
        if conds:
            hits = hits.query(' & '.join(conds))

        # limit number of hits
        data[i]['hits'] = hits.head(maxhits) if maxhits else hits
    return data