def align_family(family, mate, stats):
    """Align one mate's reads across a family and gap the quality strings to match.

    Args:
        family: Sequence of read-pair dicts. Each must hold a "qual1"/"qual2"
            entry (whichever matches `mate`).
        mate: 1 or 2 (int or str); selects which mate of each pair to use.
        stats: Mutable mapping with "time", "pairs" and "runs" counters. It is
            only updated when the family holds more than one pair.

    Returns:
        A list of {"name", "seq", "qual"} dicts, one per aligned read, or None
        when make_msa() returns None.
    """
    mate = str(mate)
    assert mate in ("1", "2")
    began = time.time()

    # Run the multiple sequence alignment; nothing more to do if it yields nothing.
    msa = make_msa(family, mate)
    if msa is None:
        return None

    # Mirror the gaps from the aligned sequences onto the raw quality strings.
    gapped_seqs = [entry["seq"] for entry in msa]
    raw_quals = [pair["qual" + mate] for pair in family]
    gapped_quals = seqtools.transfer_gaps_multi(raw_quals, gapped_seqs, gap_char_out=" ")

    # Build the output records.
    records = [
        {"name": entry["name"], "seq": entry["seq"], "qual": gapped_qual}
        for entry, gapped_qual in zip(msa, gapped_quals)
    ]

    # Log the timing, and accumulate it only for families that needed a real alignment.
    elapsed = time.time() - began
    pairs = len(family)
    logging.info("{} sec for {} read pairs.".format(elapsed, pairs))
    if pairs > 1:
        stats["time"] += elapsed
        stats["pairs"] += pairs
        stats["runs"] += 1
    return records
def align_family(family, mate, aligner='mafft'):
    """Align one mate's reads across a family and gap the quality strings to match.

    Args:
        family: Sequence of read-pair dicts with 'name', 'seq' and 'qual'
            entries suffixed by the mate number (e.g. 'seq1', 'qual1').
        mate: 1 or 2 (int or str); selects which mate of each pair to use.
        aligner: Passed through to make_msa() as its `aligner` keyword.

    Returns:
        None for an empty family; otherwise a list of {'name', 'seq', 'qual'}
        dicts, one per read pair.
    """
    mate = str(mate)
    assert mate in ('1', '2')

    family_size = len(family)
    if family_size == 0:
        return None

    if family_size == 1:
        # A lone read pair needs no alignment (and MAFFT won't accept it).
        gapped_seqs = [family[0]['seq' + mate]]
    else:
        gapped_seqs = make_msa(family, mate, aligner=aligner)

    # Mirror the gaps from the aligned sequences onto this mate's raw quality strings.
    raw_quals = [pair['qual' + mate] for pair in family]
    gapped_quals = seqtools.transfer_gaps_multi(raw_quals, gapped_seqs, gap_char_out=' ')

    # Names come from the family itself, so the aligned sequences are consumed
    # in family order.
    return [
        {'name': pair['name' + mate], 'seq': gapped_seq, 'qual': gapped_qual}
        for pair, gapped_seq, gapped_qual in zip(family, gapped_seqs, gapped_quals)
    ]
def realign_family_to_consensus(consensus, family, quals, validate=False):
    """Re-align a family's sequences together with their consensus using kalign.

    Args:
        consensus: The consensus sequence. It is handed to kalign as given
            (gap characters are not stripped from it).
        family: Iterable of family sequences; any '-' characters are removed
            before aligning.
        quals: Quality strings for the family, passed to
            seqtools.transfer_gaps_multi() along with the re-aligned family.
        validate: If true, assert that each output sequence, with gaps
            removed, equals the input at the same position.

    Returns:
        A tuple (aligned_consensus, aligned_family, aligned_quals).
    """
    # The consensus goes first so it can be split back off the front of the result.
    to_align = [consensus] + [seq.replace('-', '') for seq in family]
    realigned = kalign.align(to_align)

    if validate:
        for before, after in zip(to_align, realigned):
            assert before == after.replace('-', ''), (
                "Kalign may have returned sequences in a different order than they were given. Failed on "
                "sequence: {}".format(before)
            )

    family_realigned = realigned[1:]
    quals_realigned = seqtools.transfer_gaps_multi(quals, family_realigned, gap_char_out=' ')
    return realigned[0], family_realigned, quals_realigned
def align_family(family, mate):
    """Align one mate's reads across a family and gap the quality strings to match.

    Args:
        family: Sequence of read-pair dicts. Each must hold a 'qual1'/'qual2'
            entry (whichever matches `mate`).
        mate: 1 or 2 (int or str); selects which mate of each pair to use.

    Returns:
        A list of {'name', 'seq', 'qual'} dicts, one per aligned read, or None
        when make_msa() returns None.
    """
    mate = str(mate)
    assert mate in ('1', '2')

    # Run the multiple sequence alignment; bail out if it produced nothing.
    msa = make_msa(family, mate)
    if msa is None:
        return None

    # Pull the gapped sequences out of the alignment and this mate's raw
    # qualities out of the family, then copy the gaps across.
    gapped_seqs = [entry['seq'] for entry in msa]
    raw_quals = [pair['qual' + mate] for pair in family]
    gapped_quals = seqtools.transfer_gaps_multi(raw_quals, gapped_seqs, gap_char_out=' ')

    # Build the output records.
    return [
        {'name': entry['name'], 'seq': entry['seq'], 'qual': gapped_qual}
        for entry, gapped_qual in zip(msa, gapped_quals)
    ]
def align_family(family, mate, aligner='mafft'):
    """Produce gapped sequences and matching gapped qualities for one mate of a family.

    Args:
        family: Sequence of read-pair dicts with 'name', 'seq' and 'qual'
            entries suffixed by the mate number (e.g. 'seq2', 'qual2').
        mate: 1 or 2 (int or str); selects which mate of each pair to use.
        aligner: Passed through to make_msa() as its `aligner` keyword.

    Returns:
        None when the family is empty; otherwise a list of
        {'name', 'seq', 'qual'} dicts, one per read pair.
    """
    mate = str(mate)
    assert mate in ('1', '2')

    num_pairs = len(family)
    if num_pairs == 0:
        return None

    # With a single read pair there is nothing to align (and MAFFT won't
    # accept it), so the lone sequence is used unchanged.
    only_one = num_pairs == 1
    msa_seqs = [family[0]['seq'+mate]] if only_one else make_msa(family, mate, aligner=aligner)

    # Carry the alignment's gaps over to the quality strings for this mate.
    ungapped_quals = [pair['qual'+mate] for pair in family]
    msa_quals = seqtools.transfer_gaps_multi(ungapped_quals, msa_seqs, gap_char_out=' ')

    # Assemble the output, taking each read's name from the family entry.
    packaged = []
    for pair, msa_seq, msa_qual in zip(family, msa_seqs, msa_quals):
        record = {'name': pair['name'+mate], 'seq': msa_seq, 'qual': msa_qual}
        packaged.append(record)
    return packaged
def main(argv):
    """Command-line entry point: read sequences, align them, and print the result.

    Sequences come either from positional arguments or from --input (a file
    path, or "-" for stdin). With --format duplex the input is parsed by
    parse_duplex(), which also yields quality strings. The sequences are
    aligned with make_msa(), a consensus is computed with
    consensus.get_consensus(), and each item returned by format_alignment()
    is printed on its own line.

    Args:
        argv: Full argument vector; argv[0] is skipped and the rest is parsed.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.set_defaults(**OPT_DEFAULTS)

    parser.add_argument('seqs', metavar='sequence', nargs='*',
        help='The alignment.')
    parser.add_argument('-i', '--input',
        help='Provide the sequences in this input file instead of as command-line arguments. '
             'Give "-" to use stdin.')
    parser.add_argument('-f', '--format', choices=('plain', 'duplex'),
        help='Input format. "plain" is a simple list of the sequences, one on each line. "duplex" is '
             'the 8-column format of the family-sorted read data from the duplex pipeline. It must be '
             'the read pairs from a single alpha/beta barcode combination (both the alpha-beta and '
             'beta-alpha strands). If "duplex" is given, you must also specify which of the four '
             'possible alignments to output with --mate and --order.')
    parser.add_argument('-m', '--mate', type=int, choices=(1, 2))
    parser.add_argument('-o', '--order', choices=('ab', 'ba'))
    parser.add_argument('-F', '--qual-format', choices=('sanger',))
    parser.add_argument('-q', '--qual', type=int,
        help='Quality threshold: Default: %(default)s')

    args = parser.parse_args(argv[1:])

    # Convert the numeric threshold into its quality character (Sanger: offset 33).
    qual_thres = ' '
    if args.qual_format == 'sanger':
        qual_thres = chr(args.qual + 33)
    else:
        fail('Error: Unsupported FASTQ quality format "{}".'.format(args.qual_format))

    # Check arguments.
    if not (args.seqs or args.input):
        fail('Error: You must provide sequences either in a file with --input or as arguments.')
    elif args.seqs and args.input:
        fail('Error: You cannot provide sequences in both a file and command-line arguments.')
    if args.format == 'duplex' and not (args.mate and args.order):
        fail('Error: If the --format is duplex, you must specify a --mate and --order.')

    # Read input.
    quals = []
    if args.input:
        if args.format == 'plain':
            if args.input == '-':
                seqs = [line.strip() for line in sys.stdin]
            else:
                with open(args.input) as infile:
                    seqs = [line.strip() for line in infile]
        elif args.format == 'duplex':
            if args.input == '-':
                (seqs, quals) = parse_duplex(sys.stdin, args.mate, args.order)
            else:
                with open(args.input) as infile:
                    (seqs, quals) = parse_duplex(infile, args.mate, args.order)
        else:
            # Without this branch `seqs` would stay unbound and the failure would
            # surface later as a NameError instead of a usable message.
            fail('Error: Unrecognized or missing --format "{}".'.format(args.format))
    else:
        seqs = args.seqs

    align = make_msa(seqs)
    # Only the duplex format supplies qualities; gap them to match the alignment.
    if quals:
        quals = seqtools.transfer_gaps_multi(quals, align, gap_char_out=' ')
    cons = consensus.get_consensus(align, quals, qual_thres=qual_thres, gapped=True)
    output = format_alignment(cons, align, quals, qual_thres=ord(qual_thres))

    for seq in output:
        # Parenthesized single-argument form prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(seq)