Ejemplo n.º 1
0
def extract_seq_ids(data, fmt='fasta', variant=None):
    """
    Parse sequence data held in a string and return only the
    sequence IDs it contains.

    :param data: the sequence file contents, as a string
    :param fmt: format name handed to SequenceCollection.read
    :param variant: forwarded as the reader's ``variant`` argument,
                    but only when ``fmt`` is 'fastq'
    :return: frozenset with the ``id`` of every parsed sequence
    """
    read_kwargs = {'format': fmt}
    # The 'variant' keyword is only supplied for FASTQ input.
    if fmt == 'fastq':
        read_kwargs['variant'] = variant

    collection = SequenceCollection.read(StringIO(data), **read_kwargs)
    return frozenset(record.id for record in collection)
Ejemplo n.º 2
0
def extract_seq_ids(data, fmt='fasta', variant=None):
    """
    Read the sequences contained in ``data`` (a string) and return
    the set of their IDs.

    ``fmt`` selects the parser used by SequenceCollection.read.
    ``variant`` is passed along to the reader only for 'fastq' input;
    for every other format it is ignored.
    """
    handle = StringIO(data)
    # Only the FASTQ branch passes a 'variant' keyword to the reader.
    extra = {'variant': variant} if fmt == 'fastq' else {}
    records = SequenceCollection.read(handle, format=fmt, **extra)
    return frozenset(rec.id for rec in records)
Ejemplo n.º 3
0
def convert_phylip(infile, outfile, format):
    """
    Read a PHYLIP file and write its sequences out in another format.

    :param infile: input passed to SequenceCollection.read
    :param outfile: destination passed to the collection's write()
    :param format: name of the output format
    """
    # phylip.relaxed_ids is supplied as the reader's data parser.
    reader_options = {
        'format': 'phylip',
        'data_parser': phylip.relaxed_ids,
    }
    collection = SequenceCollection.read(infile, **reader_options)
    collection.write(outfile, format=format)
Ejemplo n.º 4
0
def main():
    """
    Command-line entry point.

    For every requested metagenome, download the "Dereplication Passed"
    and "Screen Passed" FASTQ files, filter the former using the sequence
    IDs found in the latter, and write the result to
    ``<out_dir>/<metagenome id>_screen_failed.fastq``.

    Exits with status 1 if the output directory option names an
    existing file.
    """
    opts = handle_program_options()
    out_dir = opts.out_dir
    verbose = opts.verbose
    fastq_options = {'format': 'fastq', 'variant': 'illumina1.8'}

    if osp.isfile(out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # Per the original author: this fails gracefully if the dir exists.
    skbu.create_dir(out_dir)

    # A single ID takes precedence over a file listing several IDs.
    if opts.metagenome_id is not None:
        metagenomes = [opts.metagenome_id]
    elif opts.metagenome_file is not None:
        metagenomes = list(parse_metagenome_file(opts.metagenome_file))
    else:
        metagenomes = []

    if verbose:
        summary = 'Processing requested for {} metagenome(s) found in: {}'
        print(summary.format(len(metagenomes), opts.metagenome_file))

    # MG-RAST stage.file ids for downloading
    derep_file_id = '150.1'
    screen_file_id = '299.1'

    def download(metagenome, file_id):
        # One MG-RAST 'download' request for the given stage file.
        return mgapi.mgrast_request('download', metagenome,
                                    {'file': file_id},
                                    auth_key=opts.auth_key)

    for mg_id in metagenomes:
        if verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derep_text = download(mg_id, derep_file_id).text
        derep_seqs = SequenceCollection.read(StringIO(derep_text),
                                             **fastq_options)

        if verbose:
            print('{} sequences'.format(len(derep_seqs)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screen_text = download(mg_id, screen_file_id).text
        screen_ids = extract_seq_ids(screen_text, fmt='fastq',
                                     variant='illumina1.8')
        if verbose:
            print('{} sequences'.format(len(screen_ids)))

        # filter dereplication passed with IDs from screen passed
        # (presumably leaving the reads that did not pass screening --
        # TODO confirm against filter_seqs)
        remaining = filter_seqs(derep_seqs, screen_ids)
        if verbose:
            removed_count = len(screen_ids)
            print('\tRemoved {} sequences from Dereplication Passed'
                  .format(removed_count))
            print('\tleaving {} sequences'.format(len(remaining)))

        destination = osp.join(out_dir, mg_id + '_screen_failed.fastq')
        remaining.write(destination, **fastq_options)
        if verbose:
            print('Sequence data written to: ' + destination)
Ejemplo n.º 5
0
 def test_make_mini_otu_files(self):
     """_make_mini_otu_files returns the expected FASTA text for node P1.

     A scratch directory named ``tmp`` is created in the current working
     directory for the duration of the call and is always removed
     afterwards, even if the function under test raises.
     """
     import shutil

     # Create the scratch directory without shelling out; an already
     # existing directory is tolerated, any other failure is reported.
     try:
         os.mkdir("tmp")
     except OSError:
         if not os.path.isdir("tmp"):
             raise
     try:
         self.extension_seqs = SequenceCollection.read(self.extension_seqs)
         result = _make_mini_otu_files(self.key_node,
                                       self.extension_genus_dic_few,
                                       self.extension_seqs)
     finally:
         # Clean up regardless of the outcome so later tests start fresh.
         shutil.rmtree("tmp", ignore_errors=True)
     self.assertEqual(result, """>P1\nTTAAAAAA\n""")
Ejemplo n.º 6
0
 def test_make_mini_otu_files(self):
     """Check the FASTA text produced by _make_mini_otu_files.

     The test runs with a ``tmp`` directory present in the current
     working directory. The directory is created portably (no shell
     commands) and removed in a ``finally`` clause so that a failure in
     the code under test cannot leave it behind.
     """
     import shutil

     scratch_dir = "tmp"
     try:
         os.mkdir(scratch_dir)
     except OSError:
         # An existing directory is fine; anything else is a real error.
         if not os.path.isdir(scratch_dir):
             raise
     try:
         self.extension_seqs = SequenceCollection.read(self.extension_seqs)
         result = _make_mini_otu_files(self.key_node,
                                       self.extension_genus_dic_few,
                                       self.extension_seqs)
     finally:
         shutil.rmtree(scratch_dir, ignore_errors=True)
     self.assertEqual(result, """>P1\nTTAAAAAA\n""")
Ejemplo n.º 7
0
from qiime_default_reference import get_template_alignment, get_reference_sequences

from skbio import SequenceCollection

# First 500 (id, sequence string) pairs read from the template alignment.
gapped_sequences = [
    (record.id, str(record))
    for record in SequenceCollection.read(get_template_alignment())
][:500]

# First 500 (id, sequence string) pairs read from the reference sequences.
sequences = [
    (record.id, str(record))
    for record in SequenceCollection.read(get_reference_sequences())
][:500]

# Fixed 20-character motif string.
motif_1 = "GGTGCAAGCCGGTGGAAACA"


def pairwise(l):
    """Group consecutive items two by two, trimmed to a common length.

    Items are consumed in non-overlapping pairs (1st with 2nd, 3rd with
    4th, ...). Within each pair both members are sliced down to the
    length of the shorter one. A trailing unpaired item is dropped.

    Returns a list of ``(first, second)`` tuples.
    """
    def _trim(first, second):
        width = min(len(first), len(second))
        return first[:width], second[:width]

    # Zipping one iterator with itself yields non-overlapping pairs.
    items = iter(l)
    return [_trim(first, second) for first, second in zip(items, items)]
Ejemplo n.º 8
0
def main():
    """
    Script entry point.

    Parses the program options, makes sure the output directory is
    usable, and then, for each metagenome requested (a single ID or the
    IDs parsed from a file), downloads two FASTQ stage files, filters
    the "Dereplication Passed" sequences with the IDs of the
    "Screen Passed" sequences, and writes the filtered collection to a
    ``*_screen_failed.fastq`` file in the output directory.

    Progress is printed to stdout when the verbose option is set.
    Terminates with exit status 1 when the output directory option
    points at an existing file.
    """
    args = handle_program_options()
    be_verbose = args.verbose
    variant = 'illumina1.8'

    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # Per the original author: this fails gracefully if the dir exists.
    skbu.create_dir(args.out_dir)

    if args.metagenome_id is not None:
        metagenomes = [args.metagenome_id]
    elif args.metagenome_file is not None:
        metagenomes = list(parse_metagenome_file(args.metagenome_file))
    else:
        metagenomes = []

    if be_verbose:
        print('Processing requested for {} metagenome(s) found in: {}'.format(
            len(metagenomes), args.metagenome_file))

    # MG-RAST stage.file ids for downloading
    stage_derep, stage_screen = '150.1', '299.1'

    def fetch_text(metagenome, stage_file):
        # Issue one 'download' request and hand back the response body.
        response = mgapi.mgrast_request('download', metagenome,
                                        {'file': stage_file},
                                        auth_key=args.auth_key)
        return response.text

    for mg_id in metagenomes:
        if be_verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derep_collection = SequenceCollection.read(
            StringIO(fetch_text(mg_id, stage_derep)),
            format='fastq', variant=variant)

        if be_verbose:
            print('{} sequences'.format(len(derep_collection)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        passed_ids = extract_seq_ids(fetch_text(mg_id, stage_screen),
                                     fmt='fastq', variant=variant)
        if be_verbose:
            print('{} sequences'.format(len(passed_ids)))

        # filter dereplication passed with IDs from screen passed
        filtered = filter_seqs(derep_collection, passed_ids)
        if be_verbose:
            print('\tRemoved {} sequences from Dereplication Passed'.format(
                len(passed_ids)))
            print('\tleaving {} sequences'.format(len(filtered)))

        target_path = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        filtered.write(target_path, format='fastq', variant=variant)
        if be_verbose:
            print('Sequence data written to: ' + target_path)
Ejemplo n.º 9
0
from qiime_default_reference import get_template_alignment, get_reference_sequences

from skbio import SequenceCollection

# (id, sequence string) pairs for the first 500 records of the template
# alignment.
gapped_sequences = [
    (seq.id, str(seq))
    for seq in SequenceCollection.read(get_template_alignment())
][:500]

# (id, sequence string) pairs for the first 500 reference sequences.
sequences = [
    (seq.id, str(seq))
    for seq in SequenceCollection.read(get_reference_sequences())
][:500]

# Fixed 20-character motif string.
motif_1 = "GGTGCAAGCCGGTGGAAACA"
Ejemplo n.º 10
0
    log_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    parser.add_argument(
        '--log-level', '-l', default="INFO", choices=log_choices,
        help="Set logging level. Default is info."
    )

    return parser

if __name__ == '__main__':
    # Parse the command line.
    parser = get_argument_parser()
    args = parser.parse_args()

    # Map the textual level onto the logging constant; unknown names
    # fall back to INFO.
    level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(level=level)

    sequences = SequenceCollection.read(args.infile, format=args.format)

    # Use one worker per CPU only when --parallel is 0 and there are more
    # than 16 sequences; otherwise run with a single worker.
    pool_size = (
        multiprocessing.cpu_count()
        if args.parallel == 0 and len(sequences) > 16
        else 1
    )

    # Pairwise distances computed with d2.distance, using the
    # d2_neighbourhood_dna statistic.
    dmatrix = create_distance_matrix(
        sequences,
        d2.distance,
        pool_size,
        statistic=d2.d2_neighbourhood_dna,
    )
    print(dmatrix)

    # Build the tree from the distance matrix, show it, and save it in the
    # requested target format.
    phylo_tree = nj(dmatrix)
    print(phylo_tree.ascii_art())
    phylo_tree.write(args.outfile, format=args.target)