def filterMINCED(loaded_gff, local_location):
    """Summarise every MinCED CRISPR locus as one GFF-style row.

    The rows of ``loaded_gff`` are grouped on the ``'CRISPR locus number'``
    column.  For every row of a group whose ``type`` does not contain
    ``'CRISPR'`` the repeat is cut out of the FASTA file
    ``local_location + '.fna'``.  The repeats of a locus are then reduced to
    one sequence: the gap consensus of their alignment when they all have
    the same length, otherwise one of the extracted repeats unchanged.

    Arguments:
     - loaded_gff - pandas DataFrame holding the columns ``seqid``,
       ``source``, ``type``, ``start``, ``end``, ``DR number`` and
       ``CRISPR locus number``.
     - local_location - path prefix of the genome; ``'.fna'`` is appended
       to find the FASTA file and its basename is reported as ``seqid``.

    Returns a DataFrame with the nine GFF columns, one row per reported
    locus.  ``score`` holds the number of rows in the group, ``phase`` the
    text after the last ``'='`` of the group key and ``attributes`` the
    repeat sequence.  Groups with an empty key, groups whose key contains
    ``'ID'`` and groups for which no repeat could be extracted are not
    reported.
    """
    colheaders = [
        'seqid', 'source', 'type', 'start', 'end', 'score', 'strand',
        'phase', 'attributes'
    ]
    fasta_path = local_location + '.fna'
    # One single-row frame per reported locus; concatenated once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0).
    summary_frames = []

    for name, group in loaded_gff.groupby(by='CRISPR locus number'):
        if name == '':
            continue

        results_list = []
        lengths_list = []
        unique_reps = []
        uniq_rep_nums = []
        repeat_num = 0
        number_reps = len(group)

        for _, row in group.iterrows():
            contig_ID = row['seqid']
            source = row['source']
            type_col = row['type']
            seq_start = row['start']
            seq_end = row['end']
            GeneOfInterest = row["DR number"]

            if 'CRISPR' in type_col:
                # The locus row itself only contributes the overall span.
                initial_start = row['start']
                final_end = row['end']
                continue

            repeat_num += 1
            # The FASTA file is re-read for every repeat because the matching
            # record is modified in place below.
            with open(fasta_path) as fasta_handle:
                for record in SeqIO.parse(fasta_handle, 'fasta'):
                    if contig_ID not in record.name:
                        continue
                    # NOTE(review): GFF coordinates are normally 1-based and
                    # inclusive, so this slice may be shifted by one base --
                    # kept unchanged, confirm against the MinCED output.
                    record.seq = record.seq[int(seq_start):int(seq_end)]
                    record.id = GeneOfInterest
                    record.name = GeneOfInterest
                    results_list.append(record)
                    lengths_list.append(len(record.seq))
                    if record.seq not in unique_reps:
                        # NOTE(review): the fallback repeat is taken to be
                        # the most recent *new* variant -- confirm nesting.
                        unique_reps.append(record.seq)
                        uniq_rep_nums.append(repeat_num)
                        repeat_Seq = record.seq

        if not lengths_list:
            continue

        # Guess the strand from where new repeat variants show up: compare the
        # distance of their mean ordinal to the first and to the last row.
        avgrep = np.mean(uniq_rep_nums)
        dist1st = avgrep - 1
        distlast = number_reps - avgrep
        strand = '-' if distlast > dist1st else '+'

        if max(lengths_list) == min(lengths_list):
            alignment = MultipleSeqAlignment(results_list)
            consensus = AlignInfo.SummaryInfo(alignment).gap_consensus()
        else:
            # Repeats of unequal length cannot be aligned column-wise.
            consensus = repeat_Seq
        if strand == '-':
            consensus = consensus.reverse_complement()

        if 'ID' not in name:
            summary_frames.append(
                pd.DataFrame(
                    columns=colheaders,
                    data=[[
                        os.path.basename(local_location), source, type_col,
                        initial_start, final_end, number_reps, strand,
                        name.split('=')[-1],
                        str(consensus)
                    ]]))

    if not summary_frames:
        return pd.DataFrame(columns=colheaders)
    # Like the former row-by-row append, the original row index is kept.
    return pd.concat(summary_frames)
# Reconstruct ancestral amino-acid sequences for every gene and export both
# the per-node mutations (JSON) and the reconstructed sequences (FASTA).
args = parser.parse_args()

# A single gene / translation is normalised to a one-element list.
genes = args.genes if isinstance(args.genes, list) else [args.genes]
translations = (args.translations
                if isinstance(args.translations, list)
                else [args.translations])

T = Phylo.read(args.tree, 'newick')
leafs = {n.name for n in T.get_terminals()}

node_data = {}
for gene, translation in zip(genes, translations):
    # Only sequences that belong to a tip of the tree are used.
    seqs = [s for s in SeqIO.parse(translation, 'fasta') if s.id in leafs]

    tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa')
    tt.infer_ancestral_sequences(reconstruct_tip_states=True)

    # str.replace returns the path unchanged when it has no '.fasta' part,
    # which would make the script overwrite its own input file.
    if '.fasta' in translation:
        out_fname = translation.replace('.fasta', '_withInternalNodes.fasta')
    else:
        out_fname = translation + '_withInternalNodes.fasta'

    with open(out_fname, 'w') as fh:
        for n in tt.tree.find_clades():
            if n.name not in node_data:
                node_data[n.name] = {"aa_muts": {}}
            # Mutation positions are written 1-based.
            node_data[n.name]["aa_muts"][gene] = [
                f"{a}{p+1}{d}" for a, p, d in n.mutations
            ]
            fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n")

with open(args.output, 'w') as fh:
    json.dump({"nodes": node_data}, fh)
def write(sequences, handle, format):
    """Write a complete set of sequences to a file and return the count.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects; a single
       SeqRecord is accepted too and is treated as a one-element list.
     - handle    - File handle object to write to, or filename as string.
     - format    - lower case string describing the file format to write.

    If a file handle is given, the calling code should close it afterwards
    (to ensure the data gets flushed to disk).

    Raises TypeError if ``format`` is not a string or if ``handle`` is a
    SeqRecord or a list, and ValueError if ``format`` is empty, not lower
    case, read-only or unknown.

    Returns the number of records written (as an integer).
    """
    from Bio import AlignIO

    # Argument checks first, so that mistakes give helpful error messages.
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    # A common mistake is to pass the arguments in the wrong order.
    if isinstance(handle, SeqRecord):
        raise TypeError("Check arguments, handle should NOT be a SeqRecord")
    if isinstance(handle, list):
        raise TypeError("Check arguments, handle should NOT be a list")

    if isinstance(sequences, SeqRecord):
        # This raised an exception in older versions of Biopython
        sequences = [sequences]

    mode = "wb" if format in _BinaryFormats else "w"

    with as_handle(handle, mode) as fp:
        if format in _FormatToString:
            # Simple formats: one string per record.
            to_text = _FormatToString[format]
            count = 0
            for count, record in enumerate(sequences, start=1):
                fp.write(to_text(record))
        elif format in _FormatToWriter:
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            # Alignment-only format: wrap all the records in one alignment
            # and let Bio.AlignIO write it.
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            if written != 1:
                raise RuntimeError(
                    "Internal error - the underlying writer "
                    "should have returned 1, not %r" % written
                )
            count = len(alignment)
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format
            )
        else:
            raise ValueError("Unknown format '%s'" % format)

        if not isinstance(count, int):
            raise RuntimeError(
                "Internal error - the underlying %s writer "
                "should have returned the record count, not %r" % (format, count)
            )

    return count
def __next__(self):
    """Parse the next alignment from the handle.

    Reads one CLUSTAL-style alignment (header line, first block giving the
    identifiers, then any further blocks) and returns it as a
    MultipleSeqAlignment.  If the start of another alignment is met, its
    header line is kept in ``self._header`` for the following call.

    Raises StopIteration when the handle is exhausted, when a header is
    followed by nothing but blank lines, or when no sequence data was
    found.  Raises ValueError for an unknown header, unparsable lines, or
    a record count that differs from ``self.records_per_alignment``.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    # Whitelisted headers we know about
    known_headers = [
        "CLUSTAL",
        "PROBCONS",
        "MUSCLE",
        "MSAPROBS",
        "Kalign",
        "Biopython",
    ]
    if line.strip().split()[0] not in known_headers:
        raise ValueError(
            "%s is not a known CLUSTAL header: %s"
            % (line.strip().split()[0], ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in line.split():
        if word[0] == "(" and word[-1] == ")":
            word = word[1:-1]
        # "()" leaves an empty word, which must not be indexed.
        if word and word[0] in "0123456789":
            version = word
            break

    # There should be two blank lines after the header line.  Stop at end
    # of file: readline() keeps returning "" there, which would otherwise
    # look like one more blank line forever.
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        # Header without any sequence data, nothing to return.
        raise StopIteration

    # If the alignment contains entries with the same sequence
    # identifier (not a good idea - but seems possible), then this
    # dictionary based parser will merge their sequences.  Fix this?
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            # Sequences identifier...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError(f"Could not parse line:\n{line}")

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        f"Could not parse line, bad sequence number:\n{line}"
                    ) from None
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        f"Could not parse line, invalid sequence number:\n{line}"
                    )
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            # No consensus
            break
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    # Loop over any remaining blocks...
    while True:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment.
            self._header = line
            break

        for i, expected_id in enumerate(ids):
            if line[0] == " ":
                raise ValueError(f"Unexpected line:\n{line!r}")
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError(f"Could not parse line:\n{line!r}")

            if fields[0] != expected_id:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], expected_id))

            if fields[1] != line[seq_cols]:
                # Same start column expected, only the width may change
                # (e.g. a shorter final block).
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                if start != seq_cols.start:
                    raise ValueError("Old location %s -> %i:XX" %
                                     (seq_cols, start))
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            # Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        f"Could not parse line, bad sequence number:\n{line}"
                    ) from None
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        f"Could not parse line, invalid sequence number:\n{line}"
                    )

            # Read in the next line
            line = handle.readline()
        # There should now be a consensus line
        if consensus:
            assert line[0] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        raise StopIteration

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (len(ids), self.records_per_alignment))

    records = (SeqRecord(Seq(s), id=i, description=i)
               for (i, s) in zip(ids, seqs))
    alignment = MultipleSeqAlignment(records)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        alignment_length = len(seqs[0])
        if len(consensus) != alignment_length:
            raise ValueError(
                "Alignment length is %i, consensus length is %i, '%s'" %
                (alignment_length, len(consensus), consensus))
        alignment.column_annotations["clustal_consensus"] = consensus
        # For backward compatibility prior to .column_annotations:
        alignment._star_info = consensus
    return alignment
logger.info('Working on MSA file: ' + file) MSA_file = files[file] logger.info('Number of sequences in the MSA: ' + str(len(MSA_file))) assigned = [] unassigned = [] logger.info("Assigning traits to sequences for MSA") for x in MSA_file: if spec(x.id) in species_temp.keys(): assigned.append( SeqRecord(Seq(str(x.seq)), x.id + '|' + str(species_temp[spec(x.id)]), '', '')) else: unassigned.append(SeqRecord(Seq(str(x.seq)), x.id, '', '')) logger.info('Number of sequences with assigned traits: ' + str(len(assigned))) MSA_file = MultipleSeqAlignment(unassigned) AlignIO.write(MSA_file, file.split('.')[0] + "_unassigned.fa", "fasta") MSA_file = MultipleSeqAlignment(assigned) AlignIO.write(MSA_file, file.split('.')[0] + "_assigned.fa", "fasta") def temp(txt): a = txt.split('|') b = a[-1] return float(b) #retain only those sequences within the desired OGT range logger.info("Retaining only those sequences with trait in desired range") in_range = [] for x in MSA_file: for ranges in OGT_range:
def setUp(self):
    """Create the three nucleotide / protein fixture sets used by the tests.

    Each set stores a protein alignment (``self.alnN``) and the list of
    nucleotide records with matching ids (``self.seqlistN``); the third
    set also stores codon table 3 as ``self.codontable3``.
    """

    def record(letters, name):
        # Small local shorthand for a SeqRecord that only carries an id.
        return SeqRecord(Seq(letters), id=name)

    # Test set 1
    nucleotides1 = [
        record(
            "TCAGGGACTGCGAGAACCAAGCTACTGCTGCTGCTGGCTGCGCTCTGCGCCGCAGGTGGGGCGCTGGAG",
            "pro1"),
        record(
            "TCAGGGACTTCGAGAACCAAGCGCTCCTGCTGCTGGCTGCGCTCGGCGCCGCAGGTGGAGCACTGGAG",
            "pro2"),
    ]
    proteins1 = [
        record("SGTARTKLLLLLAALCAAGGALE", "pro1"),
        record("SGTSRTKRLLLLAALGAAGGALE", "pro2"),
    ]
    self.aln1 = MultipleSeqAlignment(proteins1)
    self.seqlist1 = nucleotides1

    # Test set 2
    # M K K H E L(F)L C Q G T S N K L T Q(L)L G T F E D H F L S L Q R M F N N C E V V
    nucleotides2 = [
        record(
            "ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro1"),
        # seq4 =SeqRecord(Seq('ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAA TGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'), id='pro2')
        record(
            "ATGAAAAAGCACGAGTTCTTTGCCAAGGGACAAGTAACAAGCTCACCCAGTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAATGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro2"),
        # seq5 =SeqRecord(Seq('ATGAAAAAGCACGAGTT CTTTGCCAAGGGACAAGTAACAAGCTCACCC TTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC'), id='pro3')
        record(
            "ATGAAAAAGCACGAGTTACTTTGCCAAGGGACAAGTAACAAGCTCACCCTTGGGCACTTTTGAAGACCACTTTCTGAGCCTACAGAGGATGTTCAACAACTGTGAGGTGGTCCTTGGGAATTTGGAAATTACCTACATGCAGAGTAGTTACAACCTTTCTTTTCTCAAGACCATCCAGGAGGTTGCCGGCTATGTACTCATTGCCCTC",
            "pro3"),
    ]
    proteins2 = [
        record(
            "MKKHELLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
            "pro1"),
        record(
            "MKKHEFLCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
            "pro2"),
        record(
            "MKKHELLCQGTSNKLTLLGTFEDHFLSLQRMFNNCEVVLGNLEITYMQSSYNLSFLKTIQEVAGYVLIAL",
            "pro3"),
    ]
    self.aln2 = MultipleSeqAlignment(proteins2)
    self.seqlist2 = nucleotides2

    # Test set 3
    # use Yeast mitochondrial codon table
    nucleotides3 = [
        record(
            "ATGGCAAGGGACCACCCAGTTGGGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACCTTTCTTTTCTCAAGACCATCCAG",
            "pro6"),
        record(
            "ATGGCAAGGCACCATCCAGTTGAGCACTGATATGATCGGGTGTATTTGCAGAGTAGTAACGTGTCTCTGCTCAAGACCATCCAG",
            "pro7"),
        record(
            "ATGGCAGGGGACCACCCAGTTGGGCACTGATATGATCGTGTGTATCTGCAGAGTAGTAACCACTCTTTTCTCATGACCATCCAG",
            "pro8"),
    ]
    proteins3 = [
        record("MARDHPVGHWYDRVYLQSSNTSFTKTIQ", "pro6"),
        record("MARHHPVEHWYDRVYLQSSNVSTTKTIQ", "pro7"),
        record("MAGDHPVGHWYDRVYTQSSNHSFTMTIQ", "pro8"),
    ]
    self.aln3 = MultipleSeqAlignment(proteins3)
    self.seqlist3 = nucleotides3
    self.codontable3 = CodonTable.unambiguous_dna_by_id[3]
def __next__(self):
    """Parse the next alignment from the handle.

    The first line must hold two integers: the number of sequences and
    the sequence length.  Each record starts with a line that is split by
    ``self._split_id`` into identifier and sequence; further lines are
    appended to the sequence until the announced length is reached or a
    blank line / the end of the file is met.  The header line of a
    following alignment, if any, is kept in ``self._header``.

    Raises StopIteration at the end of the handle, and ValueError for a
    malformed first line, an unexpected record count, a sequence longer
    than announced or a sequence containing dots.

    Returns a MultipleSeqAlignment.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration
    line = line.strip()
    parts = line.split()
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers") from None

    assert self._is_header(line)

    if (self.records_per_alignment is not None
            and self.records_per_alignment != number_of_seqs):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # By default, expects STRICT truncation / padding to 10 characters.
    # Does not require any whitespace between name and seq.
    for _ in range(number_of_seqs):
        line = handle.readline().rstrip()
        sequence_id, s = self._split_id(line)
        ids.append(sequence_id)
        while len(s) < length_of_seqs:
            # The sequence may be split into multiple lines
            line = handle.readline().strip()
            if not line:
                # Blank line or end of file: stop collecting this record.
                break
            s = "".join([s, line.replace(" ", "")])
            if len(s) > length_of_seqs:
                raise ValueError("Found a record of length %i, "
                                 "should be %i" % (len(s), length_of_seqs))
        if "." in s:
            raise ValueError(_NO_DOTS)
        seqs.append(s)

    while True:
        # Find other alignments in the file
        line = handle.readline()
        if not line:
            break
        if self._is_header(line):
            self._header = line
            break

    records = (SeqRecord(Seq(s), id=i, name=i, description=i)
               for (i, s) in zip(ids, seqs))
    return MultipleSeqAlignment(records)
items = line.strip('\n').split() chrom = items[0] pos = items[1] REF = items[3] ALTs = items[4].split(',') alleles = tuple([REF] + ALTs) GTs = items[9:] for x, sample in enumerate(samples): # print(i) seq_dict[sample + '_1'][i] = alleles[int(GTs[x].split('|')[0])] seq_dict[sample + '_2'][i] = alleles[int(GTs[x].split('|')[1])] positions[i] = pos i += 1 # print(i) alignment = MultipleSeqAlignment([ SeqRecord(Seq(''.join(y), generic_dna), id=x, description='') for x, y in seq_dict.items() ]) AlignIO.write(alignment, args.output_prefix + '.fa', "fasta") if args.output_positions: with open(args.output_prefix + '.pos', 'wt') as f_out: f_out.write('\n'.join(positions) + '\n') f_out.close() #
def run(args):
    """Estimate frequencies from a tree or from alignments and write them as JSON.

    Metadata is read from ``args.metadata`` and turned into numerical dates.
    Then one of two modes is used:

    - ``args.tree`` is set: the newick tree is read, every tip is annotated
      with its mean numerical date and its metadata, and clade frequencies
      are estimated with the "diffusion" method (per region in
      ``args.regions``) or the "kde" method.
    - otherwise, ``args.alignments`` is set: mutation frequencies are
      estimated for every (gene name, alignment file) pair.

    The result is written to ``args.output`` with ``write_json``.

    Returns 1 after printing an error to stderr when the nextflu output
    format is requested together with the "kde" method, or when an alignment
    file does not exist.  All other paths fall off the end of the function.
    """
    metadata, columns = read_metadata(args.metadata)
    dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')
    stiffness = args.stiffness
    inertia = args.inertia

    # ``weights`` and ``weights_attribute`` are only bound for the "kde"
    # method; both KDE estimators below read them.
    if args.method == "kde":
        # Load weights if they have been provided.
        if args.weights:
            with open(args.weights, "r") as fh:
                weights = json.load(fh)

            weights_attribute = args.weights_attribute
        else:
            weights = None
            weights_attribute = None

    if args.tree:
        tree = Phylo.read(args.tree, 'newick')
        tps = []
        for tip in tree.get_terminals():
            tip.attr = {"num_date": np.mean(dates[tip.name])}
            tps.append(tip.attr["num_date"])

            # Annotate tips with metadata to enable filtering and weighting of
            # frequencies by metadata attributes.
            for key, value in metadata[tip.name].items():
                tip.attr[key] = value

        # NOTE(review): if args.method is neither "diffusion" nor "kde",
        # ``frequency_dict`` is never bound and write_json below fails;
        # presumably the argument parser restricts the choices -- confirm.
        if args.method == "diffusion":
            # estimate tree frequencies
            pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
            frequency_dict = {"pivots":format_frequencies(pivots)}
            frequency_dict["counts"] = {}

            for region in args.regions:
                # Omit strains sampled prior to the first pivot from frequency calculations.
                # (these tend to be reference strains included for phylogenetic context)
                if region=='global':
                    node_filter_func = lambda node: node.attr["num_date"] >= pivots[0]
                else:
                    node_filter_func = lambda node: (node.attr["region"] == region and node.attr["num_date"] >= pivots[0])

                tree_freqs = tree_frequencies(tree, pivots, method='SLSQP',
                                              node_filter = node_filter_func,
                                              ws = max(2, tree.count_terminals()//10),
                                              stiffness = stiffness, inertia=inertia,
                                              min_clades=args.minimal_clade_size_to_estimate)

                tree_freqs.estimate_clade_frequencies()

                frequency_dict["counts"][region] = [int(x) for x in tree_freqs.counts]
                if args.output_format == "nextflu":
                    # Export frequencies in nextflu-format by region and clade id.
                    for clade_id, clade_frequencies in tree_freqs.frequencies.items():
                        frequency_dict["%s_clade:%d" % (region, clade_id)] = format_frequencies(clade_frequencies)
                else:
                    # Export frequencies in auspice-format by strain name.
                    # Post-order traversal, so the children's tip counts are
                    # available when their parent is visited.
                    for node in tree.find_clades(order='postorder'):
                        if node.is_terminal():
                            node.tipcount=1
                        else:
                            node.tipcount = np.sum([c.tipcount for c in node])

                        if (node.is_terminal() or args.include_internal_nodes) and node.tipcount>args.minimal_clade_size:
                            if node.name not in frequency_dict:
                                frequency_dict[node.name] = {}
                            frequency_dict[node.name][region] = format_frequencies(tree_freqs.frequencies[node.clade])

        elif args.method == "kde":
            if args.output_format == "nextflu":
                print("ERROR: nextflu format is not supported for KDE frequencies", file=sys.stderr)
                return 1

            # Estimate frequencies.
            kde_frequencies = TreeKdeFrequencies(
                sigma_narrow=args.narrow_bandwidth,
                sigma_wide=args.wide_bandwidth,
                proportion_wide=args.proportion_wide,
                pivot_frequency=args.pivot_interval,
                start_date=args.min_date,
                end_date=args.max_date,
                weights=weights,
                weights_attribute=weights_attribute,
                include_internal_nodes=args.include_internal_nodes,
                censored=args.censored
            )
            frequencies = kde_frequencies.estimate(tree)

            # Export frequencies in auspice-format by strain name.
            frequency_dict = {"pivots": list(kde_frequencies.pivots)}
            for node_name in frequencies:
                frequency_dict[node_name] = {
                    "frequencies": format_frequencies(frequencies[node_name])
                }

        write_json(frequency_dict, args.output)
        print("tree frequencies written to", args.output, file=sys.stdout)
    elif args.alignments:
        frequencies = None
        for gene, fname in zip(args.gene_names, args.alignments):
            if not os.path.isfile(fname):
                print("ERROR: alignment file not found", file=sys.stderr)
                return 1

            # Sequences whose name starts with 'NODE_' are left out.
            aln = MultipleSeqAlignment([seq for seq in AlignIO.read(fname, 'fasta')
                                        if not seq.name.startswith('NODE_')])
            tps = np.array([np.mean(dates[seq.name]) for seq in aln])

            # The pivots come from the first alignment only and are shared
            # by all genes.
            if frequencies is None:
                pivots = get_pivots(tps, args.pivot_interval, args.min_date, args.max_date)
                frequencies = {"pivots":format_frequencies(pivots)}

            if args.method == "kde":
                kde_frequencies = AlignmentKdeFrequencies(
                    sigma_narrow=args.narrow_bandwidth,
                    sigma_wide=args.wide_bandwidth,
                    proportion_wide=args.proportion_wide,
                    pivot_frequency=args.pivot_interval,
                    start_date=args.min_date,
                    end_date=args.max_date,
                    weights=weights,
                    weights_attribute=weights_attribute,
                    include_internal_nodes=args.include_internal_nodes,
                    censored=args.censored
                )
                kde_frequencies.estimate(
                    aln,
                    tps
                )

                # Keys are rewritten from "position:state" to
                # "gene:positionstate".
                for mutation, mutation_frequencies in kde_frequencies.frequencies.items():
                    position, state = mutation.split(":")
                    frequencies["%s:%s%s" % (gene, position, state)] = format_frequencies(mutation_frequencies)
            else:
                freqs = alignment_frequencies(aln, tps, pivots, stiffness=stiffness, inertia=inertia, method='SLSQP', dtps=2.0)
                freqs.mutation_frequencies(min_freq = args.minimal_frequency, ignore_char=args.ignore_char)
                # Positions are exported 1-based (pos+1).
                frequencies.update({"%s:%d%s" % (gene, pos+1, state): format_frequencies(mutation_frequencies)
                                    for (pos, state), mutation_frequencies in freqs.frequencies.items()})
                frequencies["%s:counts" % gene] = [int(observations_per_pivot)
                                                   for observations_per_pivot in freqs.counts]

        write_json(frequencies, args.output)
        print("mutation frequencies written to", args.output, file=sys.stdout)
def __next__(self):
    """Parse the next alignment from the handle.

    Reads one GCG MSF alignment: the free-format header up to the
    MSF/Type/Check line, the ``Name:`` lines up to ``//`` and then the
    sequence blocks.  ``~`` and ``.`` are turned into ``-`` gaps, and
    sequences that stop at their own ``Len:`` value are gap padded to the
    alignment length (with a BiopythonParserWarning).  If another
    alignment follows, its first line is kept in ``self._header``.

    Raises StopIteration at the end of the handle, ValueError for any
    malformed or inconsistent content, and NotImplementedError for an
    identifier containing a space.

    Returns a MultipleSeqAlignment whose records carry their MSF weight in
    ``annotations["weight"]``.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    # Whitelisted headers we know about.
    known_headers = [
        "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
    ]
    # Examples in "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
    # would often start as follows:
    #
    # !!AA_MUTIPLE_ALIGNMENT 1.0
    # PileUp of: @/usr/users2/culhane/...
    #
    # etc with other seemingly free format text before getting to the
    # MSF/Type/Check line and the following Name: lines block and // line.
    #
    # MUSCLE just has a line "PileUp", while other sources just use the line
    # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
    # (nucleotide).
    if line.strip().split()[0] not in known_headers:
        raise ValueError(
            "%s is not a known GCG MSF header: %s" %
            (line.strip().split()[0], ", ".join(known_headers)))

    while line and " MSF: " not in line:
        line = handle.readline()

    if not line:
        raise ValueError(
            "Reached end of file without MSF/Type/Check header line")

    # Quoting from "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
    # Page 31:
    #
    # "Header information is before a .. (double dot) in a GCG format file.
    # The file will also have a checksum specific for that file."
    #
    # This was followed by a single non-aligned sequence, but this convention
    # appears to also be used in the GCG MSF files. Quoting other examples in
    # this reference, page 31:
    #
    # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 ..
    #
    # Except from page 148:
    #
    # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 ..
    #
    # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
    #
    #   MSF: 689 Type: N Check: 0000 ..
    #
    # By observation, the MSF value is the column count, type is N (nucleotide)
    # or P (protein / amino acid).
    #
    # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
    #
    # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
    # !!NA_MULTIPLE_ALIGNMENT 1.0
    #
    #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
    #
    #   Name: G26680 Len: 633 Check: 4334 Weight: 1.00
    #   Name: G26685 Len: 633 Check: 3818 Weight: 1.00
    #   Name: G29385 Len: 633 Check: 391 Weight: 1.00
    #
    # //
    #
    parts = line.strip("\n").split()
    offset = parts.index("MSF:")
    # The length test keeps a truncated header line from surfacing as an
    # IndexError instead of the ValueError below.
    if (len(parts) < offset + 4 or parts[offset + 2] != "Type:"
            or parts[-3] not in ("Check:", "CompCheck:")
            or parts[-1] != ".."):
        raise ValueError(
            "GCG MSF header line should be "
            "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
            " not: %r" % line)
    try:
        aln_length = int(parts[offset + 1])
    except ValueError:
        aln_length = -1
    if aln_length < 0:
        raise ValueError(
            "GCG MSF header line should have MSF: <int> for column count, not %r"
            % parts[offset + 1])
    seq_type = parts[offset + 3]
    if seq_type not in ["P", "N"]:
        raise ValueError(
            "GCG MSF header line should have 'Type: P' (protein) "
            "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

    # There should be a blank line after that header line, then the Name: lines
    #
    # In a possible bug, T-COFFEE v12.00 adds 'oo' after the names, as shown here,
    #
    # PileUp
    #
    #
    #
    #   MSF: 628 Type: P Check: 147 ..
    #
    #  Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000
    #  Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000
    #
    # //
    ids = []
    lengths = []
    checks = []
    weights = []
    line = handle.readline()
    while line and line.strip() != "//":
        line = handle.readline()
        if line.strip().startswith("Name: "):
            if " Len: " in line and " Check: " in line and " Weight: " in line:
                rest = line[line.index("Name: ") + 6:].strip()
                name, rest = rest.split(" Len: ")
                length, rest = rest.split(" Check: ")
                check, weight = rest.split(" Weight: ")
                name = name.strip()
                if name.endswith(" oo"):
                    # T-COFFEE oddity, ignore this
                    name = name[:-3]
                if name in ids:
                    raise ValueError("Duplicated ID of %r" % name)
                if " " in name:
                    raise NotImplementedError("Space in ID %r" % name)
                ids.append(name)
                # Expect aln_length <= int(length.strip()), see below
                lengths.append(int(length.strip()))
                checks.append(int(check.strip()))
                weights.append(float(weight.strip()))
            else:
                raise ValueError("Malformed GCG MSF name line: %r" % line)
    if not line:
        raise ValueError(
            "End of file while looking for end of header // line.")

    if not lengths:
        # Without this, max() below fails with an unhelpful message.
        raise ValueError(
            "GCG MSF header has no Name: lines before the // line.")

    if aln_length != max(lengths):
        # In broken examples from IMGTHLA was possible to continue
        # https://github.com/ANHIG/IMGTHLA/issues/201
        max_length = max(lengths)
        max_count = sum(1 for _ in lengths if _ == max_length)
        raise ValueError(
            "GCG MSF header said alignment length %i, but %s of %i sequences said Len: %s"
            % (aln_length, max_count, len(ids), max_length))

    line = handle.readline()
    if not line:
        raise ValueError("End of file after // line, expected sequences.")
    if line.strip():
        raise ValueError(
            "After // line, expected blank line before sequences.")

    # Now load the sequences
    seqs = [[] for _ in ids]  # list of empty lists
    completed_length = 0
    while completed_length < aln_length:
        # Note might have a coordinate header line (seems to be optional)
        for idx, name in enumerate(ids):
            line = handle.readline()
            if idx == 0 and not line.strip():
                # T-COFFEE uses two blank lines between blocks, rather than one
                while line and not line.strip():
                    line = handle.readline()
            if not line:
                raise ValueError(
                    "End of file where expecting sequence data.")
            # print("Looking for seq for %s in line: %r" % (name, line))
            words = line.strip().split()
            # Should we use column numbers, rather than assuming no spaces in names?
            if idx == 0 and words and words[0] != name:
                # print("Actually have a coord line")
                # Hopefully this is a coordinate header before the first seq
                try:
                    i = int(words[0])
                except ValueError:
                    i = -1
                if i != completed_length + 1:
                    raise ValueError(
                        "Expected GCG MSF coordinate line starting %i, got: %r"
                        % (completed_length + 1, line))
                if len(words) > 1:
                    # Final block usually not full 50 chars, so expect start only.
                    if len(words) != 2:
                        i = -1
                    else:
                        try:
                            i = int(words[1])
                        except ValueError:
                            i = -1
                    if i != (completed_length + 50
                             if completed_length + 50 < aln_length else
                             aln_length):
                        raise ValueError(
                            "Expected GCG MSF coordinate line %i to %i, got: %r"
                            % (
                                completed_length + 1,
                                completed_length + 50
                                if completed_length + 50 < aln_length else
                                aln_length,
                                line,
                            ))
                line = handle.readline()
                words = line.strip().split()
                # print("Still looking for seq for %s in line: %r" % (name, line))
            # Dealt with any coordinate header line, should now be sequence
            if not words:
                # Should be sequence here, but perhaps its a short one?
                if (lengths[idx] < aln_length
                        and len("".join(seqs[idx])) == lengths[idx]):
                    # Is this actually allowed in the format? Personally I would
                    # expect a line with name and a block of trailing ~ here.
                    pass
                else:
                    raise ValueError("Expected sequence for %s, got: %r" %
                                     (name, line))
            elif words[0] == name:
                assert len(words) > 1, line
                # print(i, name, repr(words))
                seqs[idx].extend(words[1:])
            else:
                raise ValueError("Expected sequence for %r, got: %r" %
                                 (name, line))
        # TODO - check the sequence lengths thus far are consistent
        # with blocks of 50?
        completed_length += 50
        line = handle.readline()
        if line.strip():
            raise ValueError("Expected blank line, got: %r" % line)

    # Skip over any whitespace at the end...
    while True:
        line = handle.readline()
        if not line:
            # End of file, no more alignments
            break
        elif not line.strip():
            # Blank line, ignore
            pass
        elif line.strip().split()[0] in known_headers:
            # Looks like the start of another alignment:
            self._header = line
            break
        else:
            raise ValueError(
                "Unexpected line after GCG MSF alignment: %r" % line)

    # Combine list of strings into single string, remap gaps
    seqs = ["".join(s).replace("~", "-").replace(".", "-") for s in seqs]

    # Apply any trailing padding for short sequences
    padded = False
    for idx, (length, s) in enumerate(zip(lengths, seqs)):
        if len(s) < aln_length and len(s) == length:
            padded = True
            seqs[idx] = s + "-" * (aln_length - len(s))
    if padded:
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "One of more alignment sequences were truncated and have been gap padded",
            BiopythonParserWarning,
        )

    records = (SeqRecord(
        Seq(s),
        id=i,
        name=i,
        description=i,
        annotations={"weight": w},
    ) for (i, s, w) in zip(ids, seqs, weights))

    # This will check alignment lengths are self-consistent:
    align = MultipleSeqAlignment(records)
    # Check matches the header:
    if align.get_alignment_length() != aln_length:
        raise ValueError(
            "GCG MSF headers said alignment length %i, but have %i" %
            (aln_length, align.get_alignment_length()))
    return align
a.close() command = '/usr/texbin/pdflatex --file-line-error --synctex=1 -output-directory=%s --save-size=10000 %s/align.tex > /dev/null' % ( TEMP_DIR, TEMP_DIR) print('Launcning command:') print(command) os.system(command) os.system('mv ' + TEMP_DIR + '/align.pdf %s.pdf' % title.replace(' ', '_')) #prof=cons_prof(alignment) #pylab.plot(prof) if __name__ == '__main__': human_h2a_z_core = Seq( 'SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLI-KATIAGGGVIPHIHKSLIG' ) xenopus_h2a_core = Seq( 'TRSSRAGLQFPVGRVHRLLRKGNYAE-RVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLP' ) # human_h2a_z_core=Seq('SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLIKATIAGGGVIPHIHKSLIG') get_pdf( 'H2A', MultipleSeqAlignment([ SeqRecord(xenopus_h2a_core, id='H2A', name='H2A'), SeqRecord(human_h2a_z_core, id='H2A.Z', name='H2A.Z') ]), 'H2AvsH2A.Z', [0, 5, 1], True, True) # get_pdf('H2A',MultipleSeqAlignment([SeqRecord(human_h2a_z_core,id='H2A',name='H2A'),SeqRecord(human_h2a_z_core,id='1H2A.Z',name='H2A.Z')]),'H2AvsH2A.Z',[0,5,1])
def get_spliced(self, starts, ends, strand=1):
    """Splice the requested reference intervals into a single alignment.

    ``starts`` and ``ends`` are parallel lists of 0-based, half-open
    ``[start, end)`` coordinates on the target sequence (the UCSC
    convention, see
    http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/),
    for instance the exons of a transcript.  To ask for the part of the
    alignment covering the first 100 nucleotides of the reference, pass
    ``starts=[0]`` and ``ends=[100]``.

    Every alignment block yielded by ``self.search(starts, ends)`` is broken
    up per reference position, and the requested positions are then joined,
    in the order given, for each sequence id seen in those blocks.
    Requested positions that a sequence does not cover are filled with "N"
    for the target sequence and "-" for every other sequence.  When
    ``strand`` differs from the strand of the target sequence in the
    fetched blocks, every spliced sequence is reverse complemented.

    If no block is found at all, the result holds one record for the target
    sequence made of "N" only.

    Returns a *MultipleSeqAlignment*.

    Raises ValueError if ``strand`` is not 1 or -1, if the target sequence
    has missing, invalid or inconsistent strand annotation, if a fetched
    block lacks the target sequence, or if one of the internal length
    checks fails.
    """
    if strand not in (1, -1):
        raise ValueError("Strand must be 1 or -1, got %s" % str(strand))

    # All alignment blocks overlapping the requested intervals.
    blocks = list(self.search(starts, ends))
    target = self._target_seqname

    # Number of reference letters the caller asked for.
    expected_letters = sum(end - start for start, end in zip(starts, ends))

    if not blocks:
        # Nothing aligned here: hand back filler of the requested length.
        filler = SeqRecord(Seq("N" * expected_letters), id=target)
        return MultipleSeqAlignment([filler])

    all_seqnames = {record.id for block in blocks for record in block}

    # sequence id -> {reference position -> letters}.  The letters stored
    # for a position include anything aligned against reference gaps that
    # directly precede that position.
    split_by_position = {seq_name: {} for seq_name in all_seqnames}

    total_rec_length = 0      # summed (unspliced) size of the target rows
    ref_first_strand = None   # strand of the first target row encountered

    for block in blocks:
        # Locate the target row; it fixes the coordinates of this block.
        for seqrec in block:
            if seqrec.id != target:
                continue
            try:
                row_strand = seqrec.annotations["strand"]
            except KeyError:
                raise ValueError(
                    "No strand information for target seqname (%s)" % target)
            if ref_first_strand is None:
                ref_first_strand = row_strand
                if ref_first_strand not in (1, -1):
                    raise ValueError("Strand must be 1 or -1")
            elif ref_first_strand != row_strand:
                raise ValueError(
                    "Encountered strand='%s' on target seqname, "
                    "expected '%s'" % (row_strand, ref_first_strand))

            # Length including gaps, i.e. the number of alignment columns.
            rec_length = len(seqrec)
            rec_start = seqrec.annotations["start"]
            rec_end = rec_start + seqrec.annotations["size"]
            total_rec_length += rec_end - rec_start

            # Start every covered position empty for every row of the block.
            for row in block:
                per_position = split_by_position[row.id]
                for pos in range(rec_start, rec_end):
                    per_position[pos] = ""
            break
        else:
            raise ValueError("Did not find %s in alignment bundle" % (target, ))

        # Walk the columns; real_pos is the true reference coordinate.
        real_pos = rec_start
        for gapped_pos in range(rec_length):
            for seqrec in block:
                letter = seqrec.seq[gapped_pos]
                if seqrec.id == target:
                    track_val = letter
                # While the reference shows "-", real_pos stays put, so the
                # letters of the other rows pile up on the same position.
                split_by_position[seqrec.id][real_pos] += letter
            # Advance only past real reference letters, and never beyond
            # the last position of this block.
            if track_val != "-" and real_pos < rec_end - 1:
                real_pos += 1

    # The target must own exactly one entry per covered reference position.
    target_positions = split_by_position[target]
    if len(target_positions) != total_rec_length:
        raise ValueError(
            "Target seqname (%s) has %s records, expected %s" %
            (target, len(target_positions), total_rec_length))

    # Reference position -> number of columns it spans (only where > 1).
    realpos_to_len = {
        pos: len(letters)
        for pos, letters in target_positions.items() if len(letters) > 1
    }

    # Join the requested intervals for every sequence.
    subseq = {}
    for seqid in all_seqnames:
        seq_split = split_by_position[seqid]
        filler_char = "N" if seqid == target else "-"
        pieces = []
        for exonstart, exonend in zip(starts, ends):
            for pos in range(exonstart, exonend):
                if pos in seq_split:
                    pieces.append(seq_split[pos])
                elif pos in realpos_to_len:
                    # Absent here but multi-column in the target: pad to match.
                    pieces.append(filler_char * realpos_to_len[pos])
                else:
                    pieces.append(filler_char)
        subseq[seqid] = "".join(pieces)

    # Sanity check: the target must contribute exactly the requested letters.
    returned_letters = len(subseq[target].replace("-", ""))
    if returned_letters != expected_letters:
        raise ValueError(
            "Returning %s letters for target seqname (%s), expected %s" %
            (returned_letters, target, expected_letters))

    # Sanity check: every row must be as long as the target row.
    ref_subseq_len = len(subseq[target])
    for seqid, text in subseq.items():
        if len(text) != ref_subseq_len:
            raise ValueError("Returning length %s for %s, expected %s" %
                             (len(text), seqid, ref_subseq_len))

    # Build the records, flipping them if the other strand was requested.
    result_multiseq = []
    for seqid, text in subseq.items():
        spliced = Seq(text)
        if strand != ref_first_strand:
            spliced = spliced.reverse_complement()
        result_multiseq.append(
            SeqRecord(spliced, id=seqid, name=seqid, description=""))
    return MultipleSeqAlignment(result_multiseq)
def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
    """Yield the alignment blocks of a MAF file as MultipleSeqAlignment objects.

    ``handle`` is an iterator over the lines of a MAF file.  A block starts
    at a line beginning with "a" and ends at the next blank line (or at the
    end of the input).  Each "s" line inside a block becomes one SeqRecord
    whose id and name are the source field and whose annotations hold
    ``start``, ``size``, ``strand`` (1 or -1) and ``srcSize``.  The
    ``key=value`` pairs of the "a" line are stored as a dict on the private
    ``_annotations`` attribute of the yielded alignment.

    If ``seq_count`` is given, each block is asserted to contain exactly
    that many records.

    Raises ValueError for an "s" line without 7 fields, a "." in the first
    sequence of a block, a malformed "a" line, or an unrecognised line
    inside a block.
    """
    in_a_bundle = False
    annotations = []
    records = []
    while True:
        # An exhausted handle reads as "", which closes a still-open bundle
        # through the same blank-line branch as a regular block end.
        line = next(handle, "")

        if not in_a_bundle:
            if line.startswith("a"):
                # start a bundle of records
                in_a_bundle = True
                annot_strings = line.strip().split()[1:]
                if len(annot_strings) != line.count("="):
                    raise ValueError(
                        "Error parsing alignment - invalid key in 'a' line")
                annotations = dict(
                    a_string.split("=") for a_string in annot_strings)
            elif not line:
                # end of input outside of any bundle
                break
            # comments and anything else between bundles are skipped
            continue

        if line.startswith("s"):
            # s (literal), src (ID), start, size, strand, srcSize, text
            fields = line.strip().split()
            if len(fields) != 7:
                raise ValueError(
                    "Error parsing alignment - 's' line must have 7 fields"
                )
            # MAF "+"/"-" becomes 1/-1; anything else falls back to 1.
            # TODO: issue warning, set to 0?
            strand = -1 if fields[4] == "-" else 1
            anno = {
                "start": int(fields[2]),
                "size": int(fields[3]),
                "strand": strand,
                "srcSize": int(fields[5])
            }
            sequence = fields[6]
            if "." in sequence:
                # a dot means "same letter as the first sequence"
                if not records:
                    raise ValueError(
                        "Found dot/period in first sequence of alignment")
                ref = str(records[0].seq)
                sequence = "".join(
                    ref_letter if letter == "." else letter
                    for letter, ref_letter in zip(sequence, ref))
            records.append(
                SeqRecord(Seq(sequence, alphabet),
                          id=fields[1],
                          name=fields[1],
                          description="",
                          annotations=anno))
        elif line.startswith(("i", "e", "q", "#")):
            # Not interpreted yet:
            #   "i" - TODO: context before/after the preceding "s" line
            #   "e" - TODO: size of the gap between alignments spanning
            #         the current block
            #   "q" - TODO: per-base quality, could go into each
            #         SeqRecord's .letter_annotations
            #   "#" - comment (unclear whether the MAF spec allows these)
            pass
        elif not line.strip():
            # blank line or end of input: the bundle is complete
            if seq_count is not None:
                assert len(records) == seq_count
            alignment = MultipleSeqAlignment(records, alphabet)
            # TODO - Introduce an annotated alignment class?
            # See also Bio/AlignIO/FastaIO.py for same requirement.
            # For now, keep the annotation on a private attribute.
            alignment._annotations = annotations
            yield alignment
            in_a_bundle = False
            annotations = []
            records = []
        else:
            raise ValueError(
                "Error parsing alignment - unexpected line:\n%s" % (line, ))
def createAlignment(sequences, alphabet):
    """Build a MultipleSeqAlignment from an iterable of sequence strings.

    The n-th string (counting from 1) becomes a SeqRecord with the id
    ``"sequence<n>"``.  ``alphabet`` is handed both to every Seq and to the
    alignment itself.
    """
    records = (
        SeqRecord(Seq(text, alphabet), id="sequence%i" % number)
        for number, text in enumerate(sequences, start=1)
    )
    return MultipleSeqAlignment(records, alphabet)
def mview_linkage(seq_file, plot_title, mafft_exe, mview_exe):
    """Align reads to a reference with MAFFT and render the result with MView.

    The sequences in ``seq_file`` are aligned by running ``mafft_exe``.  The
    first record of that alignment is taken as the reference.  Every other
    record is treated as a read whose id ends in ``.1`` or ``.2``; the part
    of the id before the last dot names the read pair.  Records with any
    other suffix are ignored.

    The alignment is then re-assembled into sections, each followed by a
    row of ``=`` characters with the id ``#``:

    * the reference, where every gap-only stretch delimited by ``N``
      characters (or the sequence ends) is replaced by ``N``;
    * pairs whose mates share no aligned column, merged into one row
      (description ``In_Pair``);
    * pairs whose mates overlap, kept as two rows (``Overlapped``);
    * reads without a mate (``Unpaired``).

    Files written into the directory of ``seq_file`` (BASE is its basename):
    ``BASE.aln`` (MAFFT output), ``BASE_MView.aln`` (re-assembled alignment),
    ``BASE_MView_tmp.html`` (raw MView output) and ``BASE_MView.html`` (MView
    output without the lines containing "Reference sequence" or
    "Colored by").

    Args:
        seq_file: path of the FASTA file holding the reference and the reads.
        plot_title: title passed to MView after ``-title``.
        mafft_exe: command used to invoke MAFFT.
        mview_exe: command used to invoke MView.

    Returns:
        None.
    """
    # define file names
    seq_path, seq_basename, _seq_ext = sep_path_basename_ext(seq_file)
    msa_file = '%s/%s.aln' % (seq_path, seq_basename)
    msa_file_mview = '%s/%s_MView.aln' % (seq_path, seq_basename)
    msa_file_mviewd_html_tmp = '%s/%s_MView_tmp.html' % (seq_path, seq_basename)
    msa_file_mviewd_html = '%s/%s_MView.html' % (seq_path, seq_basename)
    gap_char = ' '
    break_line_char = '='

    # align sequences
    # NOTE(review): both external commands are run through the shell with
    # unquoted arguments, so paths or titles containing spaces or shell
    # metacharacters will not survive; confirm callers only pass safe values.
    mafft_cmd = '%s --quiet --retree 1 %s > %s' % (mafft_exe, seq_file, msa_file)
    os.system(mafft_cmd)

    # read-pair name -> [mate 1 row, mate 2 row]; '' marks a missing mate
    mapped_reads_dict = {}
    ref_id = ''
    ref_seq = ''
    for record_index, each_seq in enumerate(AlignIO.read(msa_file, "fasta")):
        seq_id = each_seq.id
        seq_seq = str(each_seq.seq).upper()
        if record_index == 0:
            ref_id = seq_id
            ref_seq = seq_seq
            continue
        id_fields = seq_id.split('.')
        seq_id_base = '.'.join(id_fields[:-1])
        seq_id_strand = id_fields[-1]
        if seq_id_strand == '1':
            mapped_reads_dict.setdefault(seq_id_base, ['', ''])[0] = seq_seq
        elif seq_id_strand == '2':
            mapped_reads_dict.setdefault(seq_id_base, ['', ''])[1] = seq_seq

    # turn gap-only stretches of the reference into N
    ref_seq_updated = 'N'.join(
        'N' * len(segment) if segment and not segment.strip('-') else segment
        for segment in ref_seq.split('N'))

    def break_line():
        """Return a separator row as wide as the reference."""
        return SeqRecord(Seq(break_line_char * len(ref_seq_updated)),
                         id='#', description='')

    # the reference comes first, followed by a separator
    seq_record_list = [
        SeqRecord(Seq(ref_seq_updated), id=ref_id, description='Reference'),
        break_line(),
    ]

    # sort the pairs into merged / overlapping / singleton
    singleton_dict = {}
    overlapping_reads_dict = {}
    for each_read_base, mates in mapped_reads_dict.items():
        r1_seq, r2_seq = mates
        if (r1_seq == '') or (r2_seq == ''):
            if not ((r1_seq == '') and (r2_seq == '')):
                singleton_dict[each_read_base] = mates
        elif any((bp1 != '-') and (bp2 != '-')
                 for bp1, bp2 in zip(r1_seq, r2_seq)):
            overlapping_reads_dict[each_read_base] = mates
        else:
            # No column holds a base in both mates, so per column at most
            # one of them is not a gap: take mate 2 where it has a base,
            # otherwise whatever mate 1 has (a base or a gap).
            merge_r1_r2 = ''.join(
                bp1 if bp2 == '-' else bp2
                for bp1, bp2 in zip(r1_seq, r2_seq))
            seq_record_list.append(
                SeqRecord(Seq(merge_r1_r2), id=each_read_base,
                          description='In_Pair'))
    seq_record_list.append(break_line())

    # add overlapped reads
    for each_overlapping_reads, (r1_seq, r2_seq) in overlapping_reads_dict.items():
        seq_record_list.append(
            SeqRecord(Seq(r1_seq), id='%s.1' % each_overlapping_reads,
                      description='Overlapped'))
        seq_record_list.append(
            SeqRecord(Seq(r2_seq), id='%s.2' % each_overlapping_reads,
                      description='Overlapped'))
    seq_record_list.append(break_line())

    # add singletons
    for each_singleton, (r1_seq, r2_seq) in singleton_dict.items():
        if (r1_seq != '') and (r2_seq == ''):
            seq_record_list.append(
                SeqRecord(Seq(r1_seq), id='%s.1' % each_singleton,
                          description='Unpaired'))
        if (r1_seq == '') and (r2_seq != ''):
            seq_record_list.append(
                SeqRecord(Seq(r2_seq), id='%s.2' % each_singleton,
                          description='Unpaired'))
    seq_record_list.append(break_line())

    # write out updated msa; "with" closes the file even if writing fails
    align_record_mview = MultipleSeqAlignment(seq_record_list)
    with open(msa_file_mview, 'w') as msa_file_mview_handle:
        AlignIO.write(align_record_mview, msa_file_mview_handle, 'fasta')

    # visualize updated msa
    # coloring: any,identity,mismatch,consensus,group
    mview_parameter_str = '-in fasta -moltype dna -colormap CLUSTAL_NUC -coloring any -css on -html head -ruler off -label0 -label4 -label5 -gap "%s"' % gap_char
    mview_cmd = '%s %s -title %s %s > %s' % (mview_exe, mview_parameter_str,
                                             plot_title, msa_file_mview,
                                             msa_file_mviewd_html_tmp)
    os.system(mview_cmd)

    # copy the MView page, dropping its "Reference sequence" / "Colored by"
    # lines; both handles are closed on exit from the "with" block
    with open(msa_file_mviewd_html, 'w') as msa_file_mviewd_html_handle, \
            open(msa_file_mviewd_html_tmp) as html_tmp_handle:
        for each_line in html_tmp_handle:
            if ('Reference sequence' in each_line) or ('Colored by' in each_line):
                continue
            msa_file_mviewd_html_handle.write(each_line)
parser.add_argument("-o", dest='output', type=argparse.FileType('w+'), default=sys.stdout) args = parser.parse_args() in_sequences = { x.id: x for x in AlignIO.read(args.input_real, phylo.FASTA) } sim_in_sequences = { x.id: x for x in AlignIO.read(args.input_sim, phylo.FASTA) } real_tree = phylo.build_phylogenetic_tree( MultipleSeqAlignment(in_sequences.values())) N = len(in_sequences) if N < args.min_tree_size: sys.stderr.write("Tree size is too small!\n") sys.exit(-1) try: real_tree.root_at_midpoint() except: sys.stderr.write("Error processing tree!\n") sys.exit(-1) p_functions = [ phylo.get_sackin_index, phylo.get_colless_index, phylo.count_cherries,
def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
    """Alignment iterator for the FASTA tool's pairwise alignment output.

    This is for reading the pairwise alignments output by Bill Pearson's
    FASTA program when called with the -m 10 command line option for machine
    readable output.  For more details about the FASTA tools, see the website
    http://fasta.bioch.virginia.edu/ and the paper:

         W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448

    This function is intended to be used via the Bio.AlignIO.parse() function
    by specifying the format as "fasta-m10" as shown in the following code:

        from Bio import AlignIO
        handle = ...
        for a in AlignIO.parse(handle, "fasta-m10"):
            assert len(a) == 2, "Should be pairwise!"
            print "Alignment length %i" % a.get_alignment_length()
            for record in a:
                print record.seq, record.name, record.id

    Note that this is not a full blown parser for all the information
    in the FASTA output - for example, most of the header and all of the
    footer is ignored.  Also, the alignments are not batched according to
    the input queries.

    Also note that there can be up to about 30 letters of flanking region
    included in the raw FASTA output as contextual information.  This is NOT
    part of the alignment itself, and is not included in the resulting
    MultipleSeqAlignment objects returned.

    NOTE(review): this is Python 2 code (print statements,
    ``except AssertionError, err`` and ``dict.iteritems``).  Only the set-up
    and the nested ``build_hsp`` helper are visible here; the code that reads
    ``handle`` and calls ``build_hsp`` is presumably further down - confirm.
    """
    # A caller passing alphabet=None gets the same default as omitting it.
    if alphabet is None:
        alphabet = single_letter_alphabet

    # Parser state identifiers.
    # NOTE(review): not referenced in the visible part of this function.
    state_PREAMBLE = -1
    state_NONE = 0
    state_QUERY_HEADER = 1
    state_ALIGN_HEADER = 2
    state_ALIGN_QUERY = 3
    state_ALIGN_MATCH = 4
    state_ALIGN_CONS = 5

    def build_hsp():
        """Build a two-record alignment (query, match) from the parser state.

        Reads query_id, query_descr, query_seq, query_tags, match_id,
        match_descr, match_seq, match_tags, header_tags, align_tags and
        global_tags from the enclosing scope.
        NOTE(review): none of these are assigned in the visible code.

        Returns a MultipleSeqAlignment whose private ``_annotations`` dict
        holds the header tags overlaid with the alignment tags.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" \
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        # NOTE(review): evalue is not used again inside build_hsp.
        evalue = align_tags.get("fa_expect", None)
        q = "?"  #Just for printing len(q) in debug below
        m = "?"  #Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError, err:
            # Dump everything that went into the failed length check,
            # then re-raise the original assertion.
            print "Darn... amino acids vs nucleotide coordinates?"
            print tool
            print query_seq
            print query_tags
            print q, len(q)
            print match_seq
            print match_tags
            print m, len(m)
            print handle.name
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        #Alignment tags are copied last, so they win on duplicate keys.
        for key, value in header_tags.iteritems():
            alignment._annotations[key] = value
        for key, value in align_tags.iteritems():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        #Only refine the alphabet when the caller left it at the default.
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            #Wrap in a gapped alphabet unless it already declares a gap char.
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
def pairwise_sequence_alignment(a_seq, b_seq, engine, a_seq_id=None, b_seq_id=None,
                                gapopen=10, gapextend=0.5,
                                outfile=None, outdir=None, force_rerun=False):
    """Globally align two sequences and return the pairwise alignment.

    Args:
        a_seq (str, Seq, SeqRecord, SeqProp): Reference sequence
        b_seq (str, Seq, SeqRecord, SeqProp): Sequence aligned to the reference
        engine (str): Alignment program to use, ``biopython`` or ``needle``
            (case-insensitive)
        a_seq_id (str): ID given to the reference row, "a_seq" if not set
        b_seq_id (str): ID given to the second row, "b_seq" if not set
        gapopen (int): ``needle`` only - gap open penalty
        gapextend (float): ``needle`` only - gap extension penalty
        outfile (str): ``needle`` only - name of the output file
        outdir (str): ``needle`` only - path to the output directory
        force_rerun (bool): ``needle`` only - passed on to the needle runner
            to request a rerun even if the output file exists

    Returns:
        MultipleSeqAlignment: the two aligned rows.  Its annotations hold
        ``score`` and ``percent_identity`` for both engines, plus ``start``
        and ``end`` for ``biopython`` and ``percent_similarity`` and
        ``percent_gaps`` for ``needle``.

    Raises:
        ValueError: for an unknown engine, a missing needle output file, or
            a needle output file holding more than one pairwise alignment.

    """
    engine = engine.lower()
    if engine != 'biopython' and engine != 'needle':
        raise ValueError('{}: invalid engine'.format(engine))

    a_seq_id = a_seq_id or 'a_seq'
    b_seq_id = b_seq_id or 'b_seq'

    to_str = ssbio.protein.sequence.utils.cast_to_str
    a_seq = to_str(a_seq)
    b_seq = to_str(b_seq)

    if engine == 'biopython':
        # TODO: allow different matrices? needle uses blosum62 by default, how to change that?
        # TODO: how to define gap open/extend when using matrix in biopython global alignment?
        log.warning('Gap penalties not implemented in Biopython yet')
        # TODO: add gap penalties
        candidates = pairwise2.align.globaldx(a_seq, b_seq, matlist.blosum62)
        best = candidates[0]
        aligned_a = best[0]
        aligned_b = best[1]

        to_record = ssbio.protein.sequence.utils.cast_to_seq_record
        rows = [to_record(aligned_a, id=a_seq_id),
                to_record(aligned_b, id=b_seq_id)]
        alignment = MultipleSeqAlignment(rows,
                                         annotations={"score": best[2],
                                                      "start": best[3],
                                                      "end"  : best[4]})
        alignment.annotations['percent_identity'] = get_percent_identity(aligned_a, aligned_b) * 100
        return alignment

    # Only 'needle' is left at this point.
    alignment_file = run_needle_alignment(seq_a=a_seq, seq_b=b_seq,
                                          gapopen=gapopen, gapextend=gapextend,
                                          outdir=outdir, outfile=outfile,
                                          force_rerun=force_rerun)
    log.debug('Needle alignment at {}'.format(alignment_file))

    if not op.exists(alignment_file):
        raise ValueError('{}: needle alignment file does not exist'.format(alignment_file))

    # A pairwise run holds a single alignment, so take the first one parsed.
    alignment = list(AlignIO.parse(alignment_file, "emboss"))[0]

    # Rename the sequence IDs
    alignment[0].id = a_seq_id
    alignment[1].id = b_seq_id

    # Copy the needle statistics into the alignment annotations
    stats = needle_statistics(alignment_file)
    alignment_ids = list(stats.keys())
    if len(alignment_ids) > 1:
        raise ValueError('Needle alignment file contains more than one pairwise alignment')
    needle_stats = stats[alignment_ids[0]]
    for stat_name in ('percent_identity', 'percent_similarity', 'percent_gaps', 'score'):
        alignment.annotations[stat_name] = needle_stats[stat_name]

    return alignment
def __next__(self):
    """Parse the next alignment from the handle.

    The first line must hold two integers; the first is the number of
    sequences.  The next ``number_of_seqs`` lines each carry an identifier
    and the start of a sequence (split by ``self._split_id``).  Any further
    blocks of ``number_of_seqs`` lines extend the sequences in the same
    order.  A line recognised by ``self._is_header`` ends the alignment and
    is kept in ``self._header`` for the following call.

    Returns a MultipleSeqAlignment.  Raises StopIteration when there is no
    more input, and ValueError for a malformed first line, a record count
    different from ``self.records_per_alignment``, a "." inside a sequence,
    or input that ends in the middle of a block.
    """
    handle = self.handle

    if self._header is not None:
        # Header saved while parsing the previous alignment.
        header = self._header
        self._header = None
    else:
        header = handle.readline()
    if not header:
        raise StopIteration

    header = header.strip()
    fields = header.split()
    if len(fields) != 2:
        raise ValueError("First line should have two integers")
    try:
        # The second integer is only validated, not used afterwards.
        number_of_seqs, _declared_length = int(fields[0]), int(fields[1])
    except ValueError:
        raise ValueError("First line should have two integers") from None

    assert self._is_header(header)

    expected = self.records_per_alignment
    if expected is not None and expected != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (number_of_seqs, expected))

    # First block: identifier plus the first stretch of each sequence.
    # By default, expects STRICT truncation / padding to 10 characters,
    # with no whitespace required between name and sequence.
    ids = []
    seqs = []
    for _ in range(number_of_seqs):
        sequence_id, chunk = self._split_id(handle.readline().rstrip())
        ids.append(sequence_id)
        if "." in chunk:
            raise ValueError(_NO_DOTS)
        seqs.append([chunk])

    # Further blocks; "" means that no line has been read ahead yet.
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while not line.strip():
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment
            self._header = line
            break

        for row, chunks in enumerate(seqs):
            chunk = line.strip().replace(" ", "")
            if "." in chunk:
                raise ValueError(_NO_DOTS)
            chunks.append(chunk)
            line = handle.readline()
            if (not line) and row + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    records = (SeqRecord(Seq("".join(chunks)),
                         id=seq_id, name=seq_id, description=seq_id)
               for (seq_id, chunks) in zip(ids, seqs))
    return MultipleSeqAlignment(records)
def __next__(self):
    """Parse the next alignment from the handle.

    Expects a "# STOCKHOLM 1.0" header line, then reads sequence lines and
    "#=GF", "#=GC", "#=GS" and "#=GR" markup lines until the end of the input
    or the header of the next alignment, which is kept in ``self._header``
    for the following call.  "." in sequence lines is converted to "-".

    Side effects: sets ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` (GS data) and ``self.seq_col_annotation`` (GR
    data) from what was parsed.

    Returns a MultipleSeqAlignment with the GC lines stored as column
    annotations and the GR data on its private ``_annotations`` attribute.

    Raises StopIteration if the handle is exhausted or no sequence was
    found, and ValueError for a missing header, a sequence line that cannot
    be split into identifier and sequence, a record count different from
    ``self.records_per_alignment``, or inconsistent lengths.
    """
    handle = self.handle

    if self._header is not None:
        # Header saved while parsing the previous alignment.
        first_line = self._header
        self._header = None
    else:
        first_line = handle.readline()
    if not first_line:
        # Empty file - just give up.
        raise StopIteration
    if first_line.strip() != "# STOCKHOLM 1.0":
        raise ValueError("Did not find STOCKHOLM header")

    # Note: PFAM style files may state the number of sequences, e.g.
    # "#=GF SQ 67".  That is not checked against what gets parsed.

    seqs = {}
    ids = OrderedDict()  # used as an ordered set of identifiers
    gs = {}
    gr = {}
    gf = {}  # collected, but not used further down in this method
    gc = {}
    passed_end_alignment = False

    for raw_line in iter(handle.readline, ""):
        line = raw_line.strip()

        if line == "# STOCKHOLM 1.0":
            # Start of the next alignment: remember it and stop here.
            self._header = line
            break
        if line == "//":
            # End of the alignment; more meta-data may still follow.
            passed_end_alignment = True
            continue
        if line == "":
            continue

        if line[0] != "#":
            # Sequence line: identifier, a space, then the sequence.
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError(
                    "Could not split line into identifier and sequence:\n"
                    + line)
            seq_id, seq = parts
            if seq_id not in ids:
                ids[seq_id] = True
            seqs[seq_id] = seqs.get(seq_id, "") + seq.replace(".", "-")
            continue

        if len(line) < 5:
            # Too short to be a markup line; treated as a comment.
            continue

        tag = line[:5]
        payload = line[5:].strip()
        if tag == "#=GF ":
            # Per-file annotation, free text.  A feature may occur several
            # times, so the entries are kept as a list of strings.
            feature, text = payload.split(None, 1)
            gf.setdefault(feature, []).append(text)
        elif tag == "#=GC ":
            # Per-column annotation, one character per column.  Blocks may
            # be interleaved, so pieces are appended and checked later.
            feature, text = payload.split(None, 2)
            gc[feature] = gc.get(feature, "") + text.strip()
        elif tag == "#=GS ":
            # Per-sequence annotation, free text.
            seq_id, feature, text = payload.split(None, 2)
            gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
        elif tag == "#=GR ":
            # Per-sequence and per-column markup, one character per column;
            # appended for the same reason as GC.
            seq_id, feature, text = payload.split(None, 2)
            markup = gr.setdefault(seq_id, {})
            markup[feature] = markup.get(feature, "") + text.strip()

    assert len(seqs) <= len(ids)

    self.ids = ids.keys()
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        raise StopIteration

    expected = self.records_per_alignment
    if expected is not None and expected != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (len(ids), expected))

    alignment_length = len(next(iter(seqs.values())))
    records = []  # Alignment obj will put them all in a list anyway
    for seq_id in ids:
        seq = seqs[seq_id]
        if alignment_length != len(seq):
            raise ValueError(
                "Sequences have different lengths, or repeated identifier"
            )
        name, start, end = self._identifier_split(seq_id)
        # The accession defaults to the name; _populate_meta_data overrides
        # it if an explicit accession is provided.
        record = SeqRecord(
            Seq(seq, self.alphabet),
            id=seq_id,
            name=name,
            description=seq_id,
            annotations={"accession": name},
        )
        if start is not None:
            record.annotations["start"] = start
        if end is not None:
            record.annotations["end"] = end
        self._populate_meta_data(seq_id, record)
        records.append(record)

    for feature, column_text in gc.items():
        if len(column_text) != alignment_length:
            raise ValueError("%s length %i, expected %i" %
                             (feature, len(column_text), alignment_length))

    alignment = MultipleSeqAlignment(records, self.alphabet)

    for feature, column_text in sorted(gc.items()):
        if feature in self.pfam_gc_mapping:
            key = self.pfam_gc_mapping[feature]
        elif feature.endswith("_cons") and feature[:-5] in self.pfam_gr_mapping:
            key = self.pfam_gr_mapping[feature[:-5]]
        else:
            # Unknown feature: keep it under a prefixed key.
            key = "GC:" + feature
        alignment.column_annotations[key] = column_text

    # TODO - Introduce an annotated alignment class?
    # For now, keep the GR data on a private attribute.
    alignment._annotations = gr
    return alignment
def concatenate(alignments):
    """
    Concatenate a list of multiple sequence alignments by sequence id.

    Rows carrying the same id in different alignments are joined into one
    row, following the order of ``alignments``.  Where an id is absent from
    an alignment, that stretch is padded with unknown data of the
    alignment's length (:py:class:`Bio.Seq.UnknownSeq`).

    Each item may be an alignment object or a FASTA file name; items that
    ``identify_input`` classifies as ``FILENAME`` are loaded with
    ``AlignIO.read(..., "fasta")`` first.  The alphabet of the first
    alignment is used for the whole result.

    :param alignments: list of :py:class:`Bio.Align.MultipleSeqAlignment`
                       objects and/or FASTA file names
    :returns: a single :py:class:`Bio.Align.MultipleSeqAlignment`

    Example::

        >>> sequences = {'aln1': {'seq1': 'acgtca',
        ...                       'seq2': 'acgtt-',
        ...                       'seq3': 'ac-ta-'},
        ...              'aln2': {'seq2': 'ttg-cta',
        ...                       'seq3': 'tcgacta',
        ...                       'seq4': 'ttgacta'}}
        >>> alignments = [MultipleSeqAlignment([SeqRecord(Seq(sequence,
        ...                    alphabet=IUPAC.extended_dna), id=key)
        ...      for (key, sequence) in sequences[aln].items()])
        ...               for aln in ('aln1', 'aln2')]
        >>> con_alignment = concatenate(alignments)
        >>> con_alignment.sort()
        >>> print(con_alignment)
        ExtendedIUPACDNA() alignment with 4 rows and 13 columns
        acgtcaNNNNNNN seq1
        acgtt-ttg-cta seq2
        ac-ta-tcgacta seq3
        NNNNNNttgacta seq4

    :note:

        Limitations: any annotations in the sub-alignments are lost in
        the concatenated alignment.

    """
    # Load whatever was given as a file name; keep alignment objects as is.
    alignments = [
        AlignIO.read(item, "fasta")
        if identify_input(item).name == 'FILENAME' else item
        for item in alignments
    ]

    # Every sequence id occurring in any of the alignments.
    all_labels = {rec.id for aln in alignments for rec in aln}

    # id -> list of sequence pieces, one piece per alignment
    pieces = defaultdict(list)

    # Assume all alignments have same alphabet
    alphabet = alignments[0]._alphabet

    for aln in alignments:
        length = aln.get_alignment_length()
        present = {rec.id for rec in aln}

        # ids missing from this alignment get unknown data of its length
        for label in all_labels - present:
            pieces[label].append(str(UnknownSeq(length, alphabet=alphabet)))

        # ids that are present contribute their own sequence
        for rec in aln:
            pieces[rec.id].append(str(rec.seq))

    # Join the pieces per id and wrap them in the Biopython objects.
    return MultipleSeqAlignment(
        SeqRecord(Seq(''.join(parts), alphabet=alphabet), id=label)
        for (label, parts) in pieces.items())
def __next__(self):
    """Parse the next alignment from the handle.

    Reads lines from ``self.handle`` until a line starting with ``=`` or the
    end of the file. Lines starting with ``>`` are matched against
    ``XMFA_HEADER_REGEX_BIOPYTHON`` and then ``XMFA_HEADER_REGEX``; every
    other line is appended to the sequence of the most recent header.

    Side effects: identifiers not seen before are appended to ``self._ids``;
    ``self.ids`` and ``self.sequences`` are overwritten.

    Returns a ``MultipleSeqAlignment`` with one record per identifier in
    ``self._ids`` that had a header in this block. Identifiers with a header
    but no sequence text get an all-gap row.

    Raises ``StopIteration`` when the handle is exhausted or the block
    produced no sequences, and ``ValueError`` for a malformed header, for
    sequence text before any header, or for rows of unequal length.
    """
    handle = self.handle
    line = handle.readline()

    if not line:
        raise StopIteration

    # Strip out header comments
    while line and line.strip().startswith('#'):
        line = handle.readline()

    # id -> list of sequence lines; joined once below instead of growing a
    # string with += for every line.
    chunks = {}
    seq_regions = {}
    latest_id = None

    while line:
        line = line.strip()

        if line.startswith('='):
            # There may be more data, but we've reached the end of this
            # alignment
            break

        if line.startswith('>'):
            m = (XMFA_HEADER_REGEX_BIOPYTHON.match(line)
                 or XMFA_HEADER_REGEX.match(line))
            if not m:
                # The line must be interpolated with %; passing it as a
                # second argument leaves a literal "%s" in the message.
                raise ValueError("Malformed header line: %s" % line)

            parsed_id = m.group('id')
            parsed_data = {}
            for key in ('start', 'end', 'id', 'strand', 'name', 'realname'):
                try:
                    value = m.group(key)
                except IndexError:
                    # This will occur if we're asking for a group that
                    # doesn't exist. It's fine.
                    continue
                if key == 'start':
                    value = int(value)
                    # Convert to zero based counting
                    if value > 0:
                        value -= 1
                elif key == 'end':
                    value = int(value)
                parsed_data[key] = value
            seq_regions[parsed_id] = parsed_data

            if parsed_id not in self._ids:
                self._ids.append(parsed_id)

            chunks.setdefault(parsed_id, [])
            latest_id = parsed_id
        else:
            if latest_id is None:
                raise ValueError("Saw sequence before definition line")
            chunks[latest_id].append(line)

        line = handle.readline()

    seqs = dict((seq_id, ''.join(parts)) for seq_id, parts in chunks.items())

    assert len(seqs) <= len(self._ids)

    self.ids = self._ids
    self.sequences = seqs

    if not (self._ids and seqs):
        raise StopIteration

    alignment_length = max(len(text) for text in seqs.values())
    records = []
    for seq_id in self._ids:
        # Missing or empty sequences become an all-gap row.
        seq = seqs.get(seq_id) or '-' * alignment_length

        if alignment_length != len(seq):
            raise ValueError(
                "Sequences have different lengths, or repeated identifier")

        # Sometimes we don't see a particular sequence in the
        # alignment, so we skip that record since it isn't present in
        # that LCB/alignment
        if seq_id not in seq_regions:
            continue

        region = seq_regions[seq_id]
        if 'realname' in region:
            corrected_id = region['realname']
        else:
            corrected_id = region['name']

        # Append "/start-end" unless both coordinates are zero or the
        # suffix is already part of the identifier.
        if region['start'] != 0 or region['end'] != 0:
            suffix = '/{start}-{end}'.format(**region)
            if corrected_id.count(suffix) == 0:
                corrected_id += suffix

        record = SeqRecord(Seq(seq, self.alphabet), id=corrected_id,
                           name=seq_id)
        record.annotations["start"] = region['start']
        record.annotations["end"] = region['end']
        record.annotations["strand"] = 1 if region['strand'] == '+' else -1
        records.append(record)

    return MultipleSeqAlignment(records, self.alphabet)
def build_hsp():
    """Build a two-row alignment (query, then match) for the current hit.

    Takes no arguments. The names query_seq, query_tags, query_id,
    query_descr, match_seq, match_tags, match_id, match_descr, header_tags,
    align_tags, global_tags, alphabet and handle are not defined here and are
    read from the surrounding scope.

    Returns a MultipleSeqAlignment whose private ``_annotations`` dict holds
    the header tags overlaid with the alignment tags.

    Raises ValueError when both tag dicts are empty. If the extracted query
    and match regions differ in length, diagnostics are printed and the
    AssertionError is re-raised.
    """
    if not (query_tags or match_tags):
        raise ValueError("No data for query %r, match %r" %
                         (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    evalue = align_tags.get("fa_expect")
    # Placeholders so the diagnostics below can always print len(q)/len(m).
    q = "?"
    m = "?"
    tool = global_tags.get("tool", "").upper()
    try:
        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
            m = match_seq
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        assert len(q) == len(m)
    except AssertionError as err:
        print("Darn... amino acids vs nucleotide coordinates?")
        print(tool)
        print(query_seq)
        print(query_tags)
        print("%s %i" % (q, len(q)))
        print(match_seq)
        print(match_tags)
        print("%s %i" % (m, len(m)))
        print(handle.name)
        raise err

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # See also Bio/AlignIO/MafIO.py for same requirement.
    # For now the annotation lives in a private property; alignment tags
    # are applied second and so win over header tags on a key clash.
    alignment._annotations = {}
    for tags in (header_tags, align_tags):
        for key, value in tags.items():
            alignment._annotations[key] = value

    def _add_row(letters, tags, row_id, row_name, row_descr):
        # Create one record, append it, then refine its alphabet in place.
        row = SeqRecord(
            Seq(letters, alphabet),
            id=row_id,
            name=row_name,
            description=row_descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack:
        row._al_start = int(tags["al_start"])
        row._al_stop = int(tags["al_stop"])
        alignment.append(row)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            sq_type = tags["sq_type"]
            if sq_type == "D":
                row.seq.alphabet = generic_dna
            elif sq_type == "p":
                row.seq.alphabet = generic_protein
        if "-" in letters and not hasattr(row.seq.alphabet, "gap_char"):
            row.seq.alphabet = Gapped(row.seq.alphabet, "-")

    # Query first, then match, so the rows keep that order.
    _add_row(q, query_tags, query_id, "query", query_descr)
    _add_row(m, match_tags, match_id, "match", match_descr)

    return alignment
def __next__(self):
    """Parse the next alignment from the handle.

    Skips forward to the next ``#=======================================``
    line, reads the ``#`` header that follows (``Aligned_sequences``,
    ``Length``, ``Identity``, ``Similarity``, ``Gaps``, ``Score``), then
    reads the interleaved sequence lines until the next ``#----`` or
    ``#====`` separator. That separator is kept in ``self._header`` for the
    following call.

    Returns a ``MultipleSeqAlignment`` whose ``annotations`` hold whichever
    of identity/similarity/gaps/score were present in the header.

    Raises ``StopIteration`` when no further alignment is found, and
    ``ValueError`` when the header lacks the sequence count or length, when
    the count disagrees with ``self.records_per_alignment``, when a line is
    not recognised, or when coordinates and sequence text disagree.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            raise StopIteration

    length_of_seqs = None
    number_of_seqs = None
    ids = []
    header_dict = {}

    # startswith() instead of line[0]: readline() returns '' at end of file,
    # and indexing that would raise IndexError on a truncated header.
    while line.startswith("#"):
        # Read in the rest of this alignment header,
        # try and discover the number of records expected
        # and their length
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        elif key == "length":
            length_of_seqs = int(parts[1].strip())
        elif key in ("identity", "similarity", "gaps"):
            # Value looks like "n/m ..."; only the numerator is kept.
            header_dict[key] = int(parts[1].strip().split("/")[0])
        elif key == "score":
            header_dict["score"] = float(parts[1].strip())

        # And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if (self.records_per_alignment is not None
            and self.records_per_alignment != number_of_seqs):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    seq_starts = []
    index = 0

    # Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                seq_id, start_text = id_start
                seq, end_text = seq_end

                # Convert before comparing: as strings "10" >= "9" is
                # False, which sent gap-only lines down the wrong branch.
                start = int(start_text)
                end = int(end_text)
                only_gaps = seq.replace("-", "") == ""
                if start >= end:
                    # Special case, either a single letter is present,
                    # or no letters at all.
                    if not only_gaps:
                        start -= 1
                else:
                    assert not only_gaps, repr(line)
                    start -= 1  # python counting

                if index < 0 or index >= number_of_seqs:
                    raise ValueError("Expected index %i in range [0,%i)" %
                                     (index, number_of_seqs))

                # The identifier is truncated...
                assert (seq_id == ids[index]
                        or seq_id == ids[index][:len(seq_id)])

                if len(seq_starts) == index:
                    # Record the start
                    seq_starts.append(start)

                # Check the start...
                if start >= end:
                    assert seq.replace("-", "") == "", line
                elif start - seq_starts[index] != len(seqs[index].replace(
                        "-", "")):
                    raise ValueError(
                        "Found %i chars so far for sequence %i (%s, %r), line says start %i:\n%s"
                        % (
                            len(seqs[index].replace("-", "")),
                            index,
                            seq_id,
                            seqs[index],
                            start,
                            line,
                        ))
                seqs[index] += seq

                # Check the end ...
                if end != seq_starts[index] + len(seqs[index].replace(
                        "-", "")):
                    raise ValueError(
                        "Found %i chars so far for sequence %i (%s, %r, start=%i), file says end %i:\n%s"
                        % (
                            len(seqs[index].replace("-", "")),
                            index,
                            seq_id,
                            seqs[index],
                            seq_starts[index],
                            end,
                            line,
                        ))

                index += 1
                if index >= number_of_seqs:
                    index = 0
            else:
                # just a start value, this is just alignment annotation (?)
                pass
        elif line.strip() == "":
            # Just a spacer?
            pass
        else:
            raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)

        line = handle.readline()
        if (line.rstrip() == "#---------------------------------------"
                or line.rstrip() == "#======================================="):
            # End of alignment
            self._header = line
            break

    assert index == 0

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i" %
            (len(ids), self.records_per_alignment))

    records = []
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            # EMBOSS 2.9.0 is known to use spaces instead of minus signs
            # for leading gaps, and thus fails to parse. This old version
            # is still used as of Dec 2008 behind the EBI SOAP webservice:
            # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        records.append(
            SeqRecord(Seq(seq, self.alphabet), id=seq_id, description=seq_id))
    return MultipleSeqAlignment(records, self.alphabet,
                                annotations=header_dict)
def filter_vcf(args):
    """Filter VCF records and write them as VCF or as a sequence alignment.

    Reads ``args.input`` (``sys.stdin`` or a file name), ``args.output``
    (``sys.stdout`` or a file name), ``args.outtype`` and ``args.variable``.

    A record is dropped when any sample has no called bases or when no
    allele is shared by every sample. With ``args.outtype == "vcf"`` the
    surviving records are written unchanged. Otherwise each surviving record
    contributes characters to one growing string per sample, and the result
    is passed through ``remove_duplicate_seqs`` and written with
    ``AlignIO.write`` in the format named by ``args.outtype``.
    """
    fd = args.input
    of = args.output
    ot = args.outtype

    if fd == sys.stdin:
        vcf_reader = vcf.Reader(fsock=fd)
    else:
        vcf_reader = vcf.Reader(filename=fd)

    if ot == "vcf":
        if of == sys.stdout:
            vcf_writer = vcf.Writer(of, vcf_reader)
        else:
            vcf_writer = vcf.Writer(open(of, 'w'), vcf_reader)
    else:
        newalignment = None
        samplenames = None

    for record in vcf_reader:
        # One set of allele characters per sample; a trailing None marks a
        # sample without a call and aborts the record.
        gts = []
        for sample in record.samples:
            # Replacing the separator is not strictly necessary, but a set
            # makes A/T and T/A compare equal.
            sepchar = '|' if sample.phased else '/'
            bases = sample.gt_bases
            if bases is None:
                gts.append(None)
                break
            gts.append(set(bases.replace(sepchar, '')))
        if not gts or gts[-1] is None:
            continue

        # c is the allele common to all samples. The remaining sets must be
        # unpacked: passing the list itself makes set.intersection try to
        # hash each set and raise TypeError. c is empty if there were two
        # mutations (i.e. A/A -> T/T), so those records are removed.
        c = gts[0].intersection(*gts[1:])
        if len(c) == 0:
            continue

        commonalleles = [c] * len(gts)
        # If a sample is homozygous for the common allele its gt - c set is
        # empty, so its second allele is the same as its first.
        variablealleles = [((gt - c) if (gt - c) else gt) for gt in gts]

        # NOTE(review): the original indentation was lost; the union test is
        # assumed to sit inside ``if args.variable`` -- confirm upstream.
        if args.variable:
            commonalleles = [set()] * len(gts)
            if len(variablealleles[0].union(*variablealleles[1:])) == 1:
                variablealleles = [set()] * len(gts)
        if commonalleles == variablealleles == [set()] * len(gts):
            continue

        # NOTE(review): ``skip`` and ``baselist`` are not defined in this
        # function; they must exist at module level or this raises NameError.
        if skip:
            combined = baselist
        else:
            # Reduce each set to a single character. min() gives a stable
            # choice; list(s)[0] depends on per-process string hashing.
            common_chars = [min(s) if s else '' for s in commonalleles]
            variable_chars = [min(s) if s else '' for s in variablealleles]
            combined = [a + b for a, b in zip(common_chars, variable_chars)]

        if ot == "vcf":
            vcf_writer.write_record(record)
        else:
            if newalignment is None:
                newalignment = [''] * len(record.samples)
                samplenames = [x.sample for x in record.samples]
            newalignment = [
                newalignment[j] + piece for j, piece in enumerate(combined)
            ]

    if ot == "vcf":
        vcf_writer.close()
    else:
        newseqobjs = [
            SeqRecord(Seq(text, IUPAC.unambiguous_dna), id=label,
                      description='')
            for label, text in zip(samplenames, newalignment)
        ]
        newalnobj = MultipleSeqAlignment(newseqobjs)
        newalnobj = remove_duplicate_seqs(newalnobj)
        AlignIO.write(newalnobj, of, ot)
def project_CM(alnfile, contmat):
    """
    project contact maps onto alignment

    Reads the Clustal alignment in ``alnfile``, drops records whose id
    contains "dssp" or "space", and for each remaining record marks the
    alignment columns joined by every edge of ``CONTMAT[ident]`` (``ident``
    is the record id up to the first underscore).

    Writes the per-column-pair sums to
    ``../data/processed/consensus_cm.txt`` and the non-zero entries as a
    tab-separated table to ``../data/processed/consensus_cm_df.txt``.

    :param alnfile: path of the Clustal alignment file
    :param contmat: not used by the body, which reads the module-level
        ``CONTMAT`` instead. NOTE(review): confirm whether this parameter
        was meant to replace that global.
    :returns: tuple ``(CM, list_id)`` -- the (ncol, ncol, nrow) 0/1 array and
        the list of record identifiers in row order
    """
    from Bio.Align import MultipleSeqAlignment

    # Close the file once parsed instead of leaking the handle.
    with open(alnfile) as handle:
        alignment = AlignIO.read(handle, "clustal")

    # filter dssp entries from alignment file
    alignment_nr = MultipleSeqAlignment([])
    for record in alignment:
        if "dssp" not in record.id and "space" not in record.id:
            alignment_nr.append(record)
    alignment = alignment_nr

    nrow = len(alignment)
    ncol = alignment.get_alignment_length()

    list_id = []
    CM = np.zeros((ncol, ncol, nrow))

    for ix, record in enumerate(alignment):
        seq_aln = np.array(record.seq)

        ident = record.id.split('_')[0]
        list_id.append(ident)

        # The value is not used afterwards, but .item() raises unless exactly
        # one row of ``structures`` matches, so the lookup is kept.
        current_substr = structures[structures['ORF'] == ident]['substrate'].item()

        # Map 1-based sequence position -> 0-based alignment column,
        # skipping gap columns (1-based is a FIX for python indexing vs. PDB).
        current_mapping = {}
        pos_ref = 1
        for pos_aln in range(ncol):
            if seq_aln[pos_aln] != '-':
                current_mapping[pos_ref] = pos_aln
                pos_ref += 1

        current_cm = CONTMAT[ident]
        for n1, n2 in current_cm.edges():
            aln1 = current_mapping[n1]
            aln2 = current_mapping[n2]
            CM[aln1, aln2, ix] = CM[aln2, aln1, ix] = 1

    cons_cm = np.sum(CM, 2)
    np.savetxt("../data/processed/consensus_cm.txt", cons_cm, fmt='%i')

    cm_df = pd.DataFrame(columns=['i', 'j', 'n'])
    counter = 0
    for i in range(ncol):
        for j in range(ncol):
            if cons_cm[i, j] > 0:
                cm_df.loc[counter] = [i, j, cons_cm[i, j]]
                counter += 1
    cm_df.to_csv("../data/processed/consensus_cm_df.txt", sep='\t',
                 header=True, index=False)

    return CM, list_id
from Bio.Alphabet import generic_protein
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Two pre-aligned protein strings of equal length; '-' marks a gap.
seq1 = 'MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW'
seq2 = 'MH--IFIYQIGYALKSGYIQSIRSPEY-NW'

# Wrap each string in a SeqRecord carrying its identifier.
seq_rec_1, seq_rec_2 = (
    SeqRecord(Seq(residues, generic_protein), id=label)
    for residues, label in ((seq1, 'asp'), (seq2, 'unk'))
)

# Build the two-row alignment and print its summary.
align = MultipleSeqAlignment([seq_rec_1, seq_rec_2])
print(align)
def next(self):
    """Parse the next alignment from the handle.

    The first line must hold two integers, the number of sequences and
    their length. The first block gives each identifier in the first ten
    characters of its line, followed by sequence text; later blocks hold
    sequence text only, in the same order. A line that ``self._is_header``
    accepts after a block is stored in ``self._header`` and starts the next
    alignment on the following call.

    Returns a ``MultipleSeqAlignment`` built from the collected records.

    Raises ``StopIteration`` at end of file, and ``ValueError`` when the
    first line is not two integers, when the record count disagrees with
    ``self.records_per_alignment``, or when the file ends inside a block.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()

    if not line:
        raise StopIteration
    line = line.strip()
    # split() with no argument already discards empty tokens. The previous
    # filter(None, ...) returns an iterator on Python 3, so len() failed.
    parts = line.split()
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        # The length is parsed only to validate the header line.
        int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # Expects STRICT truncation/padding to 10 characters
    # Does not require any white space between name and seq.
    for i in range(0, number_of_seqs):
        line = handle.readline().rstrip()
        ids.append(line[:10].strip())  # first ten characters
        seqs.append([line[10:].strip().replace(" ", "")])

    # Look for further blocks
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while "" == line.strip():
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment
            self._header = line
            break

        # One line per sequence, in the same order as the first block.
        for i in range(0, number_of_seqs):
            seqs[i].append(line.strip().replace(" ", ""))
            line = handle.readline()
            if (not line) and i + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    records = (SeqRecord(Seq("".join(chunks), self.alphabet),
                         id=name, name=name, description=name)
               for (name, chunks) in zip(ids, seqs))
    return MultipleSeqAlignment(records, self.alphabet)
try: print(next(SeqIO.parse(h, t_format, given_alpha))) h.close() assert False, "Forcing wrong alphabet, %s, should fail (%s)" \ % (repr(given_alpha), t_filename) except ValueError: # Good - should fail pass h.close() del good, bad, given_alpha, base_alpha if t_alignment: print("Testing reading %s format file %s as an alignment" % (t_format, t_filename)) alignment = MultipleSeqAlignment( SeqIO.parse(handle=t_filename, format=t_format)) assert len(alignment) == t_count alignment_len = alignment.get_alignment_length() # Check the record order agrees, and double check the # sequence lengths all agree too. for i in range(t_count): assert compare_record(records[i], alignment[i]) assert len(records[i].seq) == alignment_len print(alignment_summary(alignment)) # Some alignment file formats have magic characters which mean # use the letter in this position in the first sequence. # They should all have been converted by the parser, but if
def aln_maker(vcf, reference, contig, start=None, stop=None,
              vcf_modifier=None, add_ref=False, check=True, alphabet=None,
              gatkwc2ref=True, wosamples=[], indels=False):
    """Build a MultipleSeqAlignment of per-sample haplotypes for a region.

    Every sample starts from a copy of the reference slice
    ``contig[start:stop]`` (one copy per ploidy level); ``_infer_indels`` or
    ``_infer_snps`` is then called to update those copies from ``vcf``.

    :param vcf: a VCF object, or a ``str`` which is wrapped as
        ``VI(vcf, vcf_modifier)``
    :param reference: path of the reference FASTA file
    :param contig: identifier of the FASTA record to use
    :param start: slice start, ``None`` meaning 0
    :param stop: slice stop, ``None`` meaning the end of the contig
    :param vcf_modifier: when truthy, assigned to ``vcf.modifier``
    :param add_ref: also emit the unmodified reference slice as a sample
        named "Reference"
    :param check: forwarded to the inference helper
    :param alphabet: alphabet for the output sequences, defaulting to
        ``unambiguous_dna``
    :param gatkwc2ref: forwarded to ``_infer_snps``
    :param wosamples: sample names to leave out of the result (only read,
        never mutated, so the shared default list is harmless)
    :param indels: use ``_infer_indels`` instead of ``_infer_snps``
    :returns: a ``MultipleSeqAlignment`` with records named
        ``<sample>_<n>`` (n counted from 1), sorted by that name
    :raises AlnException: if ``add_ref`` is set and a sample is already
        called "Reference", or if ``contig`` is not in the reference file
    """
    if isinstance(vcf, str):
        vcf = VI(vcf, vcf_modifier)
    # NOTE(review): the original indentation was lost; this is assumed to
    # apply to pre-built VCF objects too, not only to ones created above.
    if vcf_modifier:
        vcf.modifier = vcf_modifier

    # SeqIO.parse is lazy, so the dict must be built while the file is open.
    with open(reference) as f:
        fdata = {res.id: res.seq for res in SeqIO.parse(f, 'fasta')}

    samples = vcf.samples
    if add_ref and "Reference" in samples:
        raise AlnException(
            "Cannot add reference : A sample has already this name")
    if add_ref:
        # NOTE(review): if vcf.samples returns its internal list, this
        # append changes the caller's object -- confirm that is intended.
        samples.append("Reference")

    if contig not in fdata:
        raise AlnException("Contig not found in the reference sequence : %s" %
                           (contig))

    # sequence
    start = start or 0
    sequence = str(fdata[contig][start:stop])
    data = {
        sample: [MutableSeq(sequence) for i in range(ploidy)]
        for sample, ploidy in vcf.ploidies.items()
    }
    if add_ref:
        data["Reference"] = [sequence]
    if DEBUG:
        print(sequence)

    if indels:
        _infer_indels(data, vcf, sequence, contig, start, stop, check)
    else:
        _infer_snps(data, vcf, sequence, contig, start, stop, gatkwc2ref,
                    check)

    alphabet = alphabet or unambiguous_dna
    data = {
        "%s_%i" % (sample, idx): Seq(str(haplotype), alphabet)
        for sample, haplotypes in data.items()
        for idx, haplotype in enumerate(haplotypes, start=1)
        if sample not in wosamples
    }

    # len(sequence) is relative to start, so add start back to report an
    # absolute end coordinate; using the bare length was wrong when start > 0.
    stop = stop or (start + len(sequence))
    desc = "%s - %i - %i" % (contig, start, stop)
    return MultipleSeqAlignment(
        SeqRecord(data[sample], id=sample, name=sample, description=desc)
        for sample in sorted(data))