コード例 #1
0
#!/usr/bin/env python
'''Read and process blastp output file.'''

from blastplus import BlastProcessor, BLASTP_OUTPUT_FILE

bp = BlastProcessor(BLASTP_OUTPUT_FILE)

# Save the unique hits (subject ids) to disk, without a header row or an
# index column, so they can be used for sequence retrieval.
uniq_subject_ids = bp.uniq_hits()
uniq_subject_ids.to_csv(
    '../data/analyze-output/subject_ids.ref',
    header=False,
    index=False,
)
# Build one small DataFrame per query file, pairing the file's base name
# with every id found on a line that starts with COMMENT.
# NOTE(review): COMMENT is presumably the single-character FASTA header
# marker ('>') -- confirm against its definition.
query_frames = []
for query_file in query_files:
    # Plain text mode: the 'U' flag is deprecated in Python 3 and rejected
    # outright since 3.11.
    with open(query_file) as f:
        # Iterate the file object directly (xreadlines() is gone in
        # Python 3).  Drop the leading marker and strip only the line
        # terminator, so a final line without a trailing newline keeps its
        # last character.
        query_ids = [line[1:].rstrip('\r\n') for line in f
                     if line[0] == COMMENT]
    filenames = len(query_ids) * [os.path.basename(query_file)]
    df_query = pd.DataFrame({
        'filename': filenames,
        'query id': query_ids,
    })
    query_frames.append(df_query)

# DataFrame.append was removed in pandas 2.0; concatenate once onto the
# existing df_queries instead of re-copying it on every iteration.
df_queries = pd.concat([df_queries] + query_frames)
# reset_index() keeps the old (repeating) per-file index as an 'index' column.
df_queries = df_queries.reset_index()

# Load the blast output and take the top hits reported by BlastProcessor
bp = BlastProcessor(BLASTP_OUTPUT_FILE)
df_blast_results = bp.top_hits()

# Join the query table with the blast results (on the columns they share),
# then keep only the (filename, subject id) pairs.
df = df_queries.merge(df_blast_results)
df = df.loc[:, ['filename', 'subject id']]

# Discard repeated pairs, keeping the first occurrence of each
mask_dups = df.duplicated()
df = df.loc[~mask_dups]

# Concatenate the query and subject sequences into the alignment files
for query_filename, group in df.groupby('filename'):
    query_file = os.path.join(QUERY_DIR, query_filename)
    align_filename = os.path.splitext(query_filename)[0] + '.cat.faa'
    align_file = os.path.join(ALIGN_DIR, align_filename)
    with open(query_file, 'rU') as f:
        query = f.readlines()