def extract_seq_ids(data, fmt='fasta', variant=None):
    """
    Parse sequence data held in a string and return only the sequence IDs.

    :param data: contents of a sequence file, as a string
    :param fmt: format name handed to SequenceCollection.read
                (defaults to 'fasta')
    :param variant: only forwarded to the reader when fmt is 'fastq';
                    ignored for every other format
    :return: frozenset holding the ``id`` of every entry that was read
    """
    reader_options = {'format': fmt}
    # The variant keyword is passed along for FASTQ input only.
    if fmt == 'fastq':
        reader_options['variant'] = variant

    records = SequenceCollection.read(StringIO(data), **reader_options)
    return frozenset(record.id for record in records)
def convert_phylip(infile, outfile, format):
    """
    Read a PHYLIP file and write its sequences out in another format.

    :param infile: input handed to SequenceCollection.read
    :param outfile: destination handed to the collection's write method
    :param format: name of the output format to write
    """
    # The input is always read as PHYLIP, with phylip.relaxed_ids as the
    # data parser.
    SequenceCollection.read(
        infile,
        format='phylip',
        data_parser=phylip.relaxed_ids,
    ).write(outfile, format=format)
def main():
    """
    Command-line entry point.

    For every requested metagenome, download the MG-RAST "Dereplication
    Passed" and "Screen Passed" files, run filter_seqs on the dereplicated
    sequences with the screen-passed IDs, and write the result to
    <out_dir>/<metagenome id>_screen_failed.fastq.

    Exits with status 1 when --out_dir points at an existing regular file.
    """
    args = handle_program_options()

    # The output location must not be an existing regular file.
    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    # A single ID on the command line wins over a file of IDs.
    if args.metagenome_id is not None:
        metagenomes = [args.metagenome_id]
    elif args.metagenome_file is not None:
        metagenomes = list(parse_metagenome_file(args.metagenome_file))
        if args.verbose:
            msg = 'Processing requested for {} metagenome(s) found in: {}'
            print(msg.format(len(metagenomes), args.metagenome_file))
    else:
        metagenomes = []

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    def download(metagenome, stage_file):
        # Issue one MG-RAST 'download' request for the given stage file.
        return mgapi.mgrast_request('download', metagenome,
                                    {'file': stage_file},
                                    auth_key=args.auth_key)

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derep_text = download(mg_id, derep_passed).text
        derep_seqs = SequenceCollection.read(StringIO(derep_text),
                                             format='fastq',
                                             variant='illumina1.8')

        if args.verbose:
            print('{} sequences'.format(len(derep_seqs)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screen_text = download(mg_id, screen_passed).text
        screen_ids = extract_seq_ids(screen_text, fmt='fastq',
                                     variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screen_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derep_seqs, screen_ids)
        if args.verbose:
            removed = len(screen_ids)
            print('\tRemoved {} sequences from Dereplication Passed'.format(
                removed))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
def test_make_mini_otu_files(self):
    """
    Check that _make_mini_otu_files returns the expected FASTA text for
    the fixture node, genus dictionary and extension sequences.

    A ./tmp scratch directory is created before the call and removed
    afterwards, even when the call under test raises.
    """
    # Local import so this block does not depend on the file's import list.
    import shutil

    # Create the scratch directory with the standard library rather than by
    # shelling out to `mkdir`, which is not portable.
    # NOTE(review): presumably _make_mini_otu_files writes into ./tmp -
    # confirm against its implementation.
    if not os.path.isdir("tmp"):
        os.mkdir("tmp")
    try:
        self.extension_seqs = SequenceCollection.read(self.extension_seqs)
        result = _make_mini_otu_files(self.key_node,
                                      self.extension_genus_dic_few,
                                      self.extension_seqs)
    finally:
        # Always clean up; ignore_errors matches the old best-effort `rm -r`.
        shutil.rmtree("tmp", ignore_errors=True)

    self.assertEqual(result, """>P1\nTTAAAAAA\n""")
from qiime_default_reference import (
    get_template_alignment,
    get_reference_sequences,
)
from skbio import SequenceCollection

# (id, sequence string) pairs for the first 500 records of the template
# alignment; the whole collection is read before it is truncated.
gapped_sequences = [
    (record.id, str(record))
    for record in SequenceCollection.read(get_template_alignment())
][:500]

# Same layout, built from the reference sequences.
sequences = [
    (record.id, str(record))
    for record in SequenceCollection.read(get_reference_sequences())
][:500]

motif_1 = "GGTGCAAGCCGGTGGAAACA"


def pairwise(l):
    """
    Pair up consecutive items of ``l`` (1st with 2nd, 3rd with 4th, ...).

    Both members of each pair are sliced down to the length of the shorter
    one. A trailing item without a partner is dropped.

    :param l: iterable of sliceable items that support len()
    :return: list of (a, b) tuples of equal-length slices
    """
    def _trim(left, right):
        # Cut both items to the shorter of the two lengths.
        width = min(len(left), len(right))
        return left[:width], right[:width]

    # Zipping one iterator with itself consumes two items per step.
    items = iter(l)
    return [_trim(left, right) for left, right in zip(items, items)]
def main():
    """
    Command-line entry point.

    For every requested metagenome, download the MG-RAST "Dereplication
    Passed" and "Screen Passed" files, run filter_seqs on the dereplicated
    sequences with the screen-passed IDs, and write the result to
    <out_dir>/<metagenome id>_screen_failed.fastq.

    Exits with status 1 when --out_dir points at an existing regular file.
    """
    args = handle_program_options()

    # The output location must not be an existing regular file.
    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    # A single ID on the command line wins over a file of IDs.
    if args.metagenome_id is not None:
        metagenomes = [args.metagenome_id]
    elif args.metagenome_file is not None:
        metagenomes = list(parse_metagenome_file(args.metagenome_file))
        if args.verbose:
            msg = 'Processing requested for {} metagenome(s) found in: {}'
            print(msg.format(len(metagenomes), args.metagenome_file))
    else:
        metagenomes = []

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    def download(metagenome, stage_file):
        # Issue one MG-RAST 'download' request for the given stage file.
        return mgapi.mgrast_request('download', metagenome,
                                    {'file': stage_file},
                                    auth_key=args.auth_key)

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derep_text = download(mg_id, derep_passed).text
        derep_seqs = SequenceCollection.read(StringIO(derep_text),
                                             format='fastq',
                                             variant='illumina1.8')

        if args.verbose:
            print('{} sequences'.format(len(derep_seqs)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screen_text = download(mg_id, screen_passed).text
        screen_ids = extract_seq_ids(screen_text, fmt='fastq',
                                     variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screen_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derep_seqs, screen_ids)
        if args.verbose:
            removed = len(screen_ids)
            print('\tRemoved {} sequences from Dereplication Passed'.format(
                removed))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
from qiime_default_reference import (
    get_template_alignment,
    get_reference_sequences,
)
from skbio import SequenceCollection

# (id, sequence string) pairs for the first 500 records of the template
# alignment; the whole collection is read before it is truncated.
gapped_sequences = [
    (record.id, str(record))
    for record in SequenceCollection.read(get_template_alignment())
][:500]

# Same layout, built from the reference sequences.
sequences = [
    (record.id, str(record))
    for record in SequenceCollection.read(get_reference_sequences())
][:500]

motif_1 = "GGTGCAAGCCGGTGGAAACA"
log_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] parser.add_argument( '--log-level', '-l', default="INFO", choices=log_choices, help="Set logging level. Default is info." ) return parser if __name__ == '__main__': parser = get_argument_parser() args = parser.parse_args() level = getattr(logging, args.log_level.upper(), logging.INFO) logging.basicConfig(level=level) sequences = SequenceCollection.read(args.infile, format=args.format) if args.parallel == 0 and len(sequences) > 16: pool_size = multiprocessing.cpu_count() else: pool_size = 1 dmatrix = create_distance_matrix(sequences, d2.distance, pool_size, statistic=d2.d2_neighbourhood_dna) print(dmatrix) phylo_tree = nj(dmatrix) print(phylo_tree.ascii_art()) phylo_tree.write(args.outfile, format=args.target)