def main():
    '''
    Command-line entry point: filters PASA transcripts by ORF length and then
    by abundance, calling write_unigenes() after each filtering step.

    Reads the PASA FASTA/GTF, the transdecoder longest_orfs.cds file and the
    Salmon quant.sf file named on the command line, prints the initial cluster
    and transcript counts, then applies the ORF filter (fixed threshold of 100)
    followed by the TPM filter (--input_pasa_tpm_cutoff).
    '''
    parser = argparse.ArgumentParser(
        description='Export the transcript from each PASA cluster with the longest ORF')

    parser.add_argument('-if', '--input_pasa_fasta', type=str, required=True,
                        help='Path to PASAs predicted FASTA file')
    # This is a nucleotide count, so parse it as an int.  It was previously
    # declared type=str, which meant a user-supplied value arrived as text
    # while the default was an int.
    # NOTE(review): this option is parsed but not consulted anywhere in main().
    parser.add_argument('-itsc', '--input_pasa_transcript_size_cutoff', type=int, required=False,
                        default=300, help='Nucleotide size cutoff of input transcripts')
    parser.add_argument('-itc', '--input_pasa_tpm_cutoff', type=float, required=False,
                        default=0.1, help='TPM cutoff of input transcripts')
    parser.add_argument('-ig', '--input_pasa_gtf', type=str, required=True,
                        help='Path to PASAs predicted GTF file')
    parser.add_argument('-it', '--input_transdecoder', type=str, required=True,
                        help='Path to the longest_orfs.cds from transdecoder')
    parser.add_argument('-is', '--input_salmon', type=str, required=True,
                        help='Path to quant.sf file from Salmon')
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.input_pasa_fasta)
    pasa_clusters = load_pasa_clusters(seqs, args.input_pasa_gtf)
    orf_lengths = longest_orf_lengths(args.input_transdecoder)
    load_abundances(seqs, args.input_salmon)

    print("Stats:\n")
    print("Initial PASA cluster count: {0}".format(len(pasa_clusters)))
    print("Initial PASA transcript count: {0}".format(len(seqs)))

    # Step 1: ORF-length filter with a fixed threshold of 100
    seqs, pasa_clusters = apply_orf_filter(seqs, pasa_clusters, orf_lengths, 100)
    write_unigenes(seqs, pasa_clusters, 'pasa.orf_filtered', orf_lengths)

    # Step 2: abundance filter on what survived step 1
    # NOTE(review): the output label hard-codes "0.03" although the cutoff
    #  actually applied is --input_pasa_tpm_cutoff (default 0.1).  The label
    #  is left untouched so downstream consumers keep finding the same name.
    seqs, pasa_clusters = apply_abundance_filter(seqs, pasa_clusters, args.input_pasa_tpm_cutoff)
    write_unigenes(seqs, pasa_clusters, 'pasa.orf_and_tpm_0.03_filtered', orf_lengths)
def process_fasta(mols, fasta_file):
    '''
    Fills in the values of 'mols' with sequence read from a FASTA file.

    For every key of 'mols' that is also an ID in the FASTA file, the value in
    'mols' is replaced (in place) with that entry's 's' value.  Keys without a
    matching FASTA entry are left as they were.  Nothing is returned.
    '''
    parsed = utils.fasta_dict_from_file(fasta_file)

    for key in mols:
        # the FASTA file is not required to cover every molecule
        if key not in parsed:
            continue

        mols[key] = parsed[key]['s']
def main():
    '''
    Command-line entry point: transforms a tab-delimited annotation file into
    PathoLogic-format output under --output_dir.

    The genomic FASTA is loaded first, protein coordinates are parsed from the
    protein FASTA headers, the output directory is created if needed, and then
    the sibling helpers write the per-molecule files.  Annotation lines
    starting with '#' are treated as comments and skipped.
    '''
    parser = argparse.ArgumentParser(
        description='Transforms a tab-delimited annotation file to PathoLogic format')

    parser.add_argument('-a', '--annotation_tab', type=str, required=True,
                        help='Path to an input file to be parsed')
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True,
                        help='Underlying nucleotide FASTA file for the annotated proteins')
    parser.add_argument('-p', '--protein_fasta', type=str, required=True,
                        help='Protein input sequences to the pipeline, with specific headers required')
    parser.add_argument('-o', '--output_dir', type=str, required=True,
                        help='Path to an output directory to be created')
    args = parser.parse_args()

    molecules = utils.fasta_dict_from_file(args.genomic_fasta)
    protein_coords = get_protein_coordinates_from_FASTA(args.protein_fasta)

    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(args.output_dir, exist_ok=True)

    create_subdirectories(molecules, args.output_dir)
    write_elements_file(molecules, args.output_dir)
    write_seq_files(molecules, args.output_dir)

    genes = dict()

    # the context manager guarantees the annotation file is closed, even if
    #  a line fails to parse
    with open(args.annotation_tab) as annotation_fh:
        for line in annotation_fh:
            if line.startswith("#"):
                continue

            parse_annotation_line(line, genes, molecules)

    write_annotation_files(genes, molecules, protein_coords, args.output_dir)
def main():
    '''
    Command-line entry point: reports base/residue composition of a FASTA file.

    Writes a '# Total residues found' header line followed by one
    tab-delimited line per distinct residue: residue, count, percentage of all
    residues.  Output goes to --output_file if given, else STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description='Reports base/residue composition of a FASTA file')

    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        total_residues = 0
        total_residue_counts = Counter()

        for seq_id in seqs:
            residues = seqs[seq_id]['s']
            total_residues += len(residues)
            # Counter.update() adds per-character counts, replacing the manual merge loop
            total_residue_counts.update(residues)

        fout.write("# Total residues found: {0}\n".format(total_residues))

        # total_residues can't be zero inside this loop: the Counter only has
        #  entries if at least one residue was seen
        for residue, residue_count in total_residue_counts.items():
            residue_perc = (residue_count / total_residues) * 100
            fout.write("{0}\t{1}\t{2}\n".format(residue, residue_count, residue_perc))
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point: reads a multi-FASTA file and writes it back out
    with duplicate sequences removed.

    Two entries are duplicates when the MD5 digests of their sequence strings
    match; the first entry encountered is the one kept.  Sequences are written
    60 residues per line to --output_file if given, else STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description='Read a multi-FASTA file sequence and remove duplicates (by MD5 hash)')

    parser.add_argument('-i', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # digests of the sequences already written
        found = set()

        for seq_id in seqs:
            seq = seqs[seq_id]['s']

            # A fresh hash object is needed for every sequence.  Reusing one
            #  object and calling update() repeatedly hashes the concatenation
            #  of everything seen so far, so no two digests would ever match.
            md5sum = hashlib.md5(seq.encode()).hexdigest()

            if md5sum in found:
                continue

            found.add(md5sum)

            ## write this sequence, 60bp per line
            fout.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            for i in range(0, len(seq), 60):
                fout.write(seq[i : i + 60] + "\n")
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def _mask_homopolymers(residues, length_limit):
    '''
    Replaces every run of one repeated, non-'N' character longer than
    length_limit with the same number of 'N' characters.

    Returns a tuple (masked, runs) where 'masked' is the new string (always the
    same length as the input) and 'runs' is a list of
    (0-based start, base, run length) tuples, one per run replaced.  The
    comparison is case-sensitive.
    '''
    pieces = list()
    masked_runs = list()
    total = len(residues)
    run_start = 0

    while run_start < total:
        base = residues[run_start]
        run_end = run_start + 1

        # extend the run for as long as the same character repeats
        while run_end < total and residues[run_end] == base:
            run_end += 1

        run_length = run_end - run_start

        if run_length > length_limit and base != 'N':
            pieces.append('N' * run_length)
            masked_runs.append((run_start, base, run_length))
        else:
            pieces.append(residues[run_start:run_end])

        run_start = run_end

    # joining once at the end keeps this linear in the sequence length
    return ''.join(pieces), masked_runs


def main():
    '''
    Command-line entry point: replaces long homopolymeric stretches in a FASTA
    file with N characters.

    Every run longer than --homopolymer_length_limit is masked, and a WARNING
    naming the sequence, 1-based start position, base and length is written to
    STDERR for each.  The masked FASTA goes to --output if given, else STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-hll', '--homopolymer_length_limit', type=int, required=True,
                        help='Stretches of non-N residues longer than this will be replaced with Ns')
    args = parser.parse_args()

    out_fh = sys.stdout if args.output is None else open(args.output, 'wt')

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = utils.fasta_dict_from_file(args.input)

        sys.stderr.write("INFO: Looking for homopolymeric runs > {0} bp\n".format(args.homopolymer_length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            # All runs of a sequence are masked in one pass.  The earlier
            #  approach rebuilt the string from the unmodified sequence for
            #  each run, which discarded every replacement but the last.
            masked, runs = _mask_homopolymers(seqs[seq_id]['s'], args.homopolymer_length_limit)

            for run_start, base, run_length in runs:
                # positions are reported 1-based for every run, terminal ones included
                sys.stderr.write("WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n".format(
                    seq_id, run_start + 1, base, run_length))
            sys.stderr.flush()

            seqs[seq_id]['s'] = masked

            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(utils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if out_fh is not sys.stdout:
            out_fh.close()
def main():
    '''
    Command-line entry point: merges the N-masking of several FASTA files that
    contain the same sequences.

    The LAST file on the command line is the seed.  Each remaining file is
    compared with it position by position: wherever the other file has 'N' the
    merged sequence gets 'N' too.  Any other disagreement (where the merged
    base is not already 'N') is reported as a WARNING on STDERR and the seed
    base is kept.  The merged FASTA goes to --output_file if given, else STDOUT.

    Raises Exception if a sequence ID is missing from the seed file or if the
    two versions of a sequence differ in length.
    '''
    parser = argparse.ArgumentParser(description='Merge masked FASTA files')

    parser.add_argument('fasta_files', metavar='N', type=str, nargs='+',
                        help='Pass one or more FASTA files')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        # work on a copy so the parsed arguments aren't modified by the pop()
        fasta_files = list(args.fasta_files)

        # pull off a file and index it
        seqs = utils.fasta_dict_from_file(fasta_files.pop())

        # python strings are immutable, so we need to transform these into lists
        for seq_id in seqs:
            seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

        for fasta_file in fasta_files:
            new_seqs = utils.fasta_dict_from_file(fasta_file)

            for seq_id in new_seqs:
                # make sure it exists in the source file
                if seq_id not in seqs:
                    raise Exception("ERROR: Seq ID {0} was found in file {1} but not in the seed file".format(seq_id, fasta_file))

                merged = seqs[seq_id]['s']
                incoming = new_seqs[seq_id]['s']

                # They should also be the same length.  This compares the
                #  sequences themselves; comparing the enclosing records would
                #  only compare how many keys each entry has.
                if len(merged) != len(incoming):
                    raise Exception("ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths".format(seq_id, fasta_file))

                for i, base in enumerate(incoming):
                    if base == merged[i]:
                        continue

                    if base == 'N':
                        merged[i] = 'N'
                    elif merged[i] != 'N':
                        # STDERR, so the warning can't end up inside FASTA written to STDOUT
                        sys.stderr.write("WARNING: Disagreement {0}-{1} at position {2}\n".format(base, merged[i], i))

        # now done, print out the results
        for seq_id in seqs:
            ofh.write(">{0} {1}\n{2}\n".format(seq_id, seqs[seq_id]['h'],
                                               utils.wrapped_fasta(''.join(seqs[seq_id]['s']))))
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    '''
    Command-line entry point: reverses or reverse-complements selected
    sequences within a multi-FASTA file.

    Every sequence in the input is written out, 60 residues per line; those
    whose ID appears in --id_file (one ID per line) first have --action
    applied.  Only the sequence ID is written on each header line.  Output
    goes to --output_file if given, else STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description='Reverse or reverse-complement selected sequences within a multi-FASTA')

    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-i', '--id_file', type=str, required=True,
                        help='Path to file with IDs to process')
    parser.add_argument('-a', '--action', type=str, required=True, choices=['reverse', 'revcomp'],
                        help='What should be done to the sequences in the ID file')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # a set gives constant-time membership tests in the loop below, and
        #  the context manager closes the ID file as soon as it has been read
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == 'reverse':
                    seq['s'] = seq['s'][::-1]
                elif args.action == 'revcomp':
                    seq['s'] = utils.reverse_complement(seq['s'])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq['s']), 60):
                fout.write(seq['s'][i:i + 60] + "\n")
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def process_assembly_fasta(mols, fasta_file):
    '''
    Attaches sequence from a FASTA file to the molecule objects in 'mols'.

    For every key of 'mols' that is also an ID in the FASTA file, the object
    stored under that key gets its 'residues' attribute set to the entry's 's'
    value and its 'length' attribute set to the length of those residues.
    Molecules without a matching FASTA entry are not touched.  Nothing is
    returned.
    '''
    records = utils.fasta_dict_from_file(fasta_file)

    for mol_id in mols:
        # the FASTA file is not required to cover every molecule
        if mol_id not in records:
            continue

        molecule = mols[mol_id]
        molecule.residues = records[mol_id]['s']
        molecule.length = len(molecule.residues)
def main():
    """
    Command-line entry point: reverses or reverse-complements selected
    sequences within a multi-FASTA file.

    Every sequence in the input is written out, 60 residues per line; those
    whose ID appears in --id_file (one ID per line) first have --action
    applied.  Only the sequence ID is written on each header line.  Output
    goes to --output_file if given, else STDOUT.
    """
    parser = argparse.ArgumentParser(
        description="Reverse or reverse-complement selected sequences within a multi-FASTA"
    )

    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file")
    parser.add_argument("-i", "--id_file", type=str, required=True, help="Path to file with IDs to process")
    parser.add_argument(
        "-a",
        "--action",
        type=str,
        required=True,
        choices=["reverse", "revcomp"],
        help="What should be done to the sequences in the ID file",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=False,
        default=None,
        help="Optional Path to an output file to be created",
    )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output_file is None else open(args.output_file, "wt")

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # a set gives constant-time membership tests in the loop below, and
        #  the context manager closes the ID file as soon as it has been read
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == "reverse":
                    seq["s"] = seq["s"][::-1]
                elif args.action == "revcomp":
                    seq["s"] = utils.reverse_complement(seq["s"])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq["s"]), 60):
                fout.write(seq["s"][i : i + 60] + "\n")
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point: extracts regions from a multi-FASTA file.

    Each line of the tab-delimited --coords_file names a molecule plus start
    and stop coordinates (column numbers given 1-based on the command line).
    The coordinates are converted with utils.humancoords_to_0interbase(), the
    matching slice is cut from the molecule, reverse-complemented when the
    returned strand is -1, and written 60 residues per line.  Lines with fewer
    than 3 columns are skipped.

    The fragment ID comes from --name_col if given; otherwise it is built as
    '<mol>___<fmin>.<fmax>.<strand>'.

    Raises Exception if a molecule ID isn't present in the FASTA file.
    '''
    parser = argparse.ArgumentParser(description='Extract regions from a multi-FASTA file')

    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-c', '--coords_file', type=str, required=True,
                        help='Path to a tab-delimited file with coordinates')
    parser.add_argument('-m', '--mol_col', type=int, required=True,
                        help='Tabdel file column with molecule identifiers')
    parser.add_argument('-x', '--start_coord_col', type=int, required=True,
                        help='Tabdel file column with coordinate start positions')
    parser.add_argument('-y', '--stop_coord_col', type=int, required=True,
                        help='Tabdel file column with coordinate stop positions')
    parser.add_argument('-n', '--name_col', type=int, required=False, default=None,
                        help='Optional tabdel file column with name for exported fragment')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # command-line column numbers are 1-based; list indexes are 0-based
        start_col = args.start_coord_col - 1
        stop_col = args.stop_coord_col - 1
        mol_col = args.mol_col - 1
        name_col = None if args.name_col is None else args.name_col - 1

        # the context manager closes the coordinates file even if a line raises
        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                cols = line.rstrip().split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = utils.humancoords_to_0interbase(int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception("ERROR: molecule ID ({0}) not found in FASTA file".format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format(mol_id, fmin, fmax, strand)
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i : i + 60] + "\n")
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point: transforms a tab-delimited annotation file into
    PathoLogic-format output under --output_dir.

    Steps, in order: load the genomic FASTA, parse protein coordinates from
    the protein FASTA headers, make sure the output directory exists, let the
    sibling helpers write the per-molecule files, then parse every
    non-comment line of the annotation file and write the annotation files.
    '''
    parser = argparse.ArgumentParser(
        description='Transforms a tab-delimited annotation file to PathoLogic format')

    parser.add_argument('-a', '--annotation_tab', type=str, required=True,
                        help='Path to an input file to be parsed')
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True,
                        help='Underlying nucleotide FASTA file for the annotated proteins')
    parser.add_argument('-p', '--protein_fasta', type=str, required=True,
                        help='Protein input sequences to the pipeline, with specific headers required')
    parser.add_argument('-o', '--output_dir', type=str, required=True,
                        help='Path to an output directory to be created')
    args = parser.parse_args()

    molecules = utils.fasta_dict_from_file(args.genomic_fasta)
    protein_coords = get_protein_coordinates_from_FASTA(args.protein_fasta)

    # exist_ok makes this a single race-free call instead of exists-then-create
    os.makedirs(args.output_dir, exist_ok=True)

    create_subdirectories(molecules, args.output_dir)
    write_elements_file(molecules, args.output_dir)
    write_seq_files(molecules, args.output_dir)

    genes = dict()

    # reading inside a context manager ensures the handle is released promptly
    with open(args.annotation_tab) as annotation_fh:
        for line in annotation_fh:
            # lines starting with '#' are comments
            if not line.startswith("#"):
                parse_annotation_line(line, genes, molecules)

    write_annotation_files(genes, molecules, protein_coords, args.output_dir)
def get_protein_coordinates_from_FASTA(protein_fasta):
    '''
    This function is probably only relevant to a limited number of tasks where
    the protein input FASTA file to the pipeline was produced by transdecoder,
    which incorporates the predicted ORF coordinates into the FASTA header,
    like this:

       >m.13585 g.13585 ORF g.13585 m.13585 type:3prime_partial len:76 (+) comp100033_c0_seq1:118-348(+)

    Of all that, the only part we care about is that the first model number
    'm.13585' matches that of the second column in the annotation file, along
    with the matching genomic molecule name at the end of the header
    'comp100033_c0_seq1:118-348(+)'

    Returns a dict keyed on the model name (like 'm.13585') with the following
    keyed values:

       'mol'    = molecule ID (like 'comp100033_c0_seq1')
       'fmin'   = 0-interbase start coordinate (117, from example above)
       'fmax'   = 0-interbase stop coordinate (348, from example above)
       'strand' = always 1; only headers with a '(+)' location are recognized

    Raises Exception if any header lacks a location in the expected format
    (this includes locations given on the reverse strand).
    '''
    protein_locs = dict()
    fasta_dict = utils.fasta_dict_from_file(protein_fasta)

    # Raw string, so the backslashes reach the regex engine untouched.  In a
    #  normal string literal sequences like '\d' are invalid string escapes.
    pattern = re.compile(r'(comp\d+_c\d+_seq\d+):(\d+)-(\d+)\(\+\)')

    for model_id in fasta_dict:
        # defensive check; NOTE(review): this can't trigger if fasta_dict is a
        #  plain dict, since its keys are already unique
        if model_id in protein_locs:
            raise Exception("ERROR: found duplicate model ID in file: {0}".format(protein_fasta))

        header = fasta_dict[model_id]['h']
        m = pattern.search(header)

        if m is None:
            raise Exception(
                "ERROR: unexpected header format. Expected to parse something like comp100033_c0_seq1:118-348(+). Got: {0}"
                .format(header))

        protein_locs[model_id] = {
            'mol': m.group(1),
            # header coordinates are 1-based; fmin is shifted to 0-interbase
            'fmin': int(m.group(2)) - 1,
            'fmax': int(m.group(3)),
            'strand': 1
        }

    return protein_locs
def initialize_polypeptides( log_fh, fasta_file ):
    '''
    Builds one things.Polypeptide per entry of a FASTA file and returns them
    in a dict keyed by sequence ID.

    Each polypeptide is created with the entry's residues and their length and
    is given a new annotation.FunctionalAnnotation whose product name is
    DEFAULT_PRODUCT_NAME.  One INFO line per polypeptide is written to log_fh
    recording that initial product name.
    '''
    records = utils.fasta_dict_from_file(fasta_file)
    by_id = {}

    for seq_id in records:
        residues = records[seq_id]['s']
        new_polypeptide = things.Polypeptide(id=seq_id, length=len(residues), residues=residues)

        initial_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))

        new_polypeptide.annotation = initial_annotation
        by_id[seq_id] = new_polypeptide

    return by_id
def main():
    '''
    Command-line entry point: reads a multi-FASTA file and writes it back out
    with duplicate sequences removed.

    Sequences are compared by the MD5 digest of their residue string, and only
    the first entry with a given digest is written.  Output is wrapped at 60
    residues per line and goes to --output_file if given, else STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description='Read a multi-FASTA file sequence and remove duplicates (by MD5 hash)')

    parser.add_argument('-i', '--fasta_file', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.fasta_file)

        # digests of the sequences already written
        found = set()

        for seq_id in seqs:
            seq = seqs[seq_id]['s']

            # Hash each sequence on its own.  A single md5 object fed with
            #  update() for every sequence yields the digest of all sequences
            #  concatenated so far, which is different on every iteration and
            #  therefore never flags a duplicate.
            md5sum = hashlib.md5(seq.encode()).hexdigest()

            if md5sum not in found:
                found.add(md5sum)

                ## write this sequence, 60bp per line
                fout.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i:i + 60] + "\n")
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point: splits a multi-FASTA file into separate protein
    and nucleotide files.

    Each sequence is scored with nucleotide_composition(); a score at or above
    --cutoff sends it to the --nucleotide file, anything lower goes to the
    --protein file.  Either output may be omitted, in which case sequences of
    that kind are dropped, but at least one must be given.

    Raises Exception if neither -p nor -n was specified.
    '''
    parser = argparse.ArgumentParser(
        description='Split multi-FASTA file into separate protein and nucleotide files')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-p', '--protein', type=str, required=False,
                        help='Path to an output FASTA file for protein sequences')
    parser.add_argument('-n', '--nucleotide', type=str, required=False,
                        help='Path to an output FASTA file for nucleotide sequences')
    # Parsed as a number: with type=str a user-supplied cutoff arrived as text
    #  and the >= comparison below raised TypeError.
    parser.add_argument('-c', '--cutoff', type=float, required=False, default=80,
                        help='Min percent (1-100) of ATGCNX content to be considered a nucleotide sequence')
    args = parser.parse_args()

    ## the user should have specified at least one
    if args.protein is None and args.nucleotide is None:
        raise Exception("ERROR: you must specify either -p or -n options (else why are you running this script?)")

    pout = nout = None

    try:
        if args.protein is not None:
            pout = open(args.protein, 'wt')

        if args.nucleotide is not None:
            nout = open(args.nucleotide, 'wt')

        seqs = utils.fasta_dict_from_file(args.input)

        for seq_id in seqs:
            seq = seqs[seq_id]
            seqcomp = nucleotide_composition(seq['s'])
            seq_wrapped = wrapped(seq['s'], every=60)

            # at or above the cutoff it's a nucleotide, otherwise a protein
            target = nout if seqcomp >= args.cutoff else pout

            if target is not None:
                target.write(">{0} {1}\n{2}\n".format(seq_id, seq['h'], seq_wrapped))
    finally:
        # close whichever outputs were actually opened
        for fh in (pout, nout):
            if fh is not None:
                fh.close()
def main():
    '''
    Command-line entry point: reformats a FASTA file so that no sequence line
    is longer than --width residues (default 60).

    With --upper_case every sequence is upper-cased first.  Output goes to
    --output if given, else STDOUT.
    '''
    parser = argparse.ArgumentParser(
        description='Reformats a FASTA file such that there are no more than -w characters of sequence residues per line.')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-w', '--width', type=int, required=False, default=60,
                        help='Width - number of residues per line')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Output file to be created.  Default = STDOUT')
    parser.add_argument('-uc', '--upper_case', action='store_true', required=False,
                        help='Forces all bases to be upper-case')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout if args.output is None else open(args.output, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.input)

        for seq_id in seqs:
            # store_true makes this a real bool, so no comparison is needed
            if args.upper_case:
                seqs[seq_id]['s'] = seqs[seq_id]['s'].upper()

            seq_wrapped = utils.wrapped_fasta(seqs[seq_id]['s'], every=args.width)
            fout.write(">{0} {1}\n{2}\n".format(seq_id, seqs[seq_id]['h'], seq_wrapped))
    finally:
        # close (and so flush) a file we opened, but never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point: creates a GFF3 file from a genomic FASTA.

    After the '##gff-version 3' header, one feature line is written per
    sequence.  It spans position 1 to the sequence length, uses --source for
    column 2 and --molecule_term for column 3, and carries the sequence ID as
    its ID attribute.  If the FASTA entry has a non-empty 'h' value, that is
    added as the Name attribute.
    '''
    parser = argparse.ArgumentParser(description='Creates a GFF3 file from a genomic FASTA')

    parser.add_argument('-i', '--input_fasta', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_gff3', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-s', '--source', type=str, required=True,
                        help='Source, fills column 2 of the output GFF3 file')
    parser.add_argument('-t', '--molecule_term', type=str, required=False, default='contig',
                        help='SO term to represent genomic sequence type')
    args = parser.parse_args()

    # Characters that would be read as structure rather than text inside a
    #  GFF3 attribute value, and so must be percent-encoded.  '%' itself is
    #  included so that the encoding stays reversible.
    reserved = str.maketrans({
        '%': '%25',
        ';': '%3B',
        '=': '%3D',
        '&': '%26',
        ',': '%2C',
        '\t': '%09',
    })

    # the context manager closes (and flushes) the output even on error
    with open(args.output_gff3, 'wt') as ofh:
        seqs = utils.fasta_dict_from_file(args.input_fasta)

        # header
        ofh.write("##gff-version 3\n")

        for seq_id in seqs:
            ofh.write("{0}\t{1}\t{2}\t1\t{3}\t.\t.\t.\tID={0}".format(
                seq_id, args.source, args.molecule_term, len(seqs[seq_id]['s'])))

            header = seqs[seq_id]['h']

            if len(header) > 0:
                # free text from the FASTA header: escape before embedding
                ofh.write(";Name={0}".format(header.translate(reserved)))

            ofh.write("\n")
def main():
    '''
    Command-line entry point: reports the longest ORF length over the three
    forward reading frames of every sequence in a FASTA file.

    For each frame (1, 2, 3) the sequence is translated starting at that
    offset, and the longest stretch of residues uninterrupted by '*' is
    measured.  One tab-delimited line is printed per sequence: ID, winning
    frame, and the length of its longest stretch.  Ties go to the lowest
    frame; if no frame yields any residues the frame is printed as None.
    '''
    parser = argparse.ArgumentParser(description='Reports longest ORF length in all frames')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    args = parser.parse_args()

    records = utils.fasta_dict_from_file(args.input_file)

    for seq_id in records:
        best_frame = None
        best_frame_length = 0

        for frame in (1, 2, 3):
            # frame N starts N-1 bases into the sequence
            protein = utils.translate(records[seq_id]['s'][frame - 1:])

            longest_run = 0
            run = 0

            for residue in protein:
                if residue == '*':
                    # a stop closes the current open stretch
                    longest_run = max(longest_run, run)
                    run = 0
                else:
                    run += 1

            # the final stretch may not have been closed by a stop
            longest_run = max(longest_run, run)

            if longest_run > best_frame_length:
                best_frame = frame
                best_frame_length = longest_run

        print("{0}\t{1}\t{2}".format(seq_id, best_frame, best_frame_length))
def get_protein_coordinates_from_FASTA(protein_fasta):
    '''
    This function is probably only relevant to a limited number of tasks where
    the protein input FASTA file to the pipeline was produced by transdecoder,
    which incorporates the predicted ORF coordinates into the FASTA header,
    like this:

       >m.13585 g.13585 ORF g.13585 m.13585 type:3prime_partial len:76 (+) comp100033_c0_seq1:118-348(+)

    Of all that, the only part we care about is that the first model number
    'm.13585' matches that of the second column in the annotation file, along
    with the matching genomic molecule name at the end of the header
    'comp100033_c0_seq1:118-348(+)'

    Returns a dict keyed on the model name (like 'm.13585') with the following
    keyed values:

       'mol'    = molecule ID (like 'comp100033_c0_seq1')
       'fmin'   = 0-interbase start coordinate (117, from example above)
       'fmax'   = 0-interbase stop coordinate (348, from example above)
       'strand' = always 1; only headers with a '(+)' location are recognized

    Raises Exception if any header lacks a location in the expected format
    (this includes locations given on the reverse strand).
    '''
    protein_locs = dict()
    fasta_dict = utils.fasta_dict_from_file(protein_fasta)

    # A raw string keeps '\d' and friends as regex escapes; without the r
    #  prefix they are invalid string-literal escapes.
    pattern = re.compile(r'(comp\d+_c\d+_seq\d+):(\d+)-(\d+)\(\+\)')

    for model_id in fasta_dict:
        # defensive check; NOTE(review): this can't trigger if fasta_dict is a
        #  plain dict, since its keys are already unique
        if model_id in protein_locs:
            raise Exception("ERROR: found duplicate model ID in file: {0}".format(protein_fasta))

        m = pattern.search(fasta_dict[model_id]['h'])

        if m:
            protein_locs[model_id] = {
                'mol': m.group(1),
                # header coordinates are 1-based; fmin is shifted to 0-interbase
                'fmin': int(m.group(2)) - 1,
                'fmax': int(m.group(3)),
                'strand': 1,
            }
        else:
            raise Exception("ERROR: unexpected header format. Expected to parse something like comp100033_c0_seq1:118-348(+). Got: {0}".format(fasta_dict[model_id]['h']))

    return protein_locs
def main():
    '''
    Command-line entry point: generates a figure showing coverage/abundance
    vs. molecule size.

    The pileup file is read line by line.  Column 1 is the molecule ID,
    column 2 a 1-based position and column 4 the coverage at that position.
    Lines for one molecule are expected to be contiguous; when the ID changes,
    the median coverage of the finished molecule is computed and tallied under
    its length bin.  The tallies are drawn as a scatter plot where marker area
    is the number of molecules sharing a (bin, median) pair.

    If --output_file is the literal string 'plot' the figure is shown
    interactively; otherwise it is saved to that path.
    '''
    parser = argparse.ArgumentParser(
        description='Generates a figure showing coverage/abundance vs. molecule size.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input pileup file')
    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to the FASTA file of reference molecules')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    # NOTE(review): --mol_size_limit is parsed but not used anywhere below
    parser.add_argument('-s', '--mol_size_limit', type=int, required=False, default=5000,
                        help='Results for molecules over this size will be grouped together')
    parser.add_argument('-b', '--mol_bin_size', type=int, required=False, default=10,
                        help='Set the binning resolution of the transcript size axis')
    args = parser.parse_args()

    ## first, we need a collection of the FASTA data and the molecule lengths
    molecules = utils.fasta_dict_from_file(args.fasta_file)

    ## data points for plotting
    #  structure:  length bin -> { median coverage -> number of molecules }
    # NOTE(review): the bin key is int(length / mol_bin_size), i.e. a bin
    #  index rather than a length in bp, although the x axis is labelled
    #  'Molecule length'.  Confirm which one is intended.
    data_bins = defaultdict(lambda: defaultdict(int))

    def tally_molecule(mol_id, coverages):
        # records one finished molecule; returns (length, median coverage)
        mol_length = len(molecules[mol_id]['s'])
        mol_length_bin = int(mol_length / args.mol_bin_size)
        median_size = np.median(coverages)
        data_bins[mol_length_bin][median_size] += 1
        return mol_length, median_size

    current_molecule_id = None
    current_molecule_coverages = list()

    ## These files are usually huge.  For scalability, operations performed within this
    #  loop should be limited.
    with open(args.input_file) as pileup_fh:
        for line in pileup_fh:
            cols = line.split("\t")

            if cols[0] != current_molecule_id:
                # on the very first line there's no previous molecule to close out
                if current_molecule_id is not None:
                    mol_length, median_size = tally_molecule(current_molecule_id, current_molecule_coverages)
                    print("DEBUG: molecule {0} appeared to be {1} bp in length with median coverage of {2}".format(
                        current_molecule_id, mol_length, median_size))

                # reset
                current_molecule_id = cols[0]
                current_molecule_coverages = [0] * len(molecules[cols[0]]['s'])

            try:
                current_molecule_coverages[int(cols[1]) - 1] = int(cols[3])
            except IndexError:
                print("ERROR: pileup file reports position {0} coverage but transcript {1} is only {2} bp in length".format(
                    cols[1], current_molecule_id, len(molecules[cols[0]]['s'])))

    # don't forget the last one - unless the pileup file was empty, in which
    #  case there is no molecule to record
    if current_molecule_id is not None:
        tally_molecule(current_molecule_id, current_molecule_coverages)

    ## now generate the plot data - x,y positions and radii
    x = list()
    y = list()
    r = list()

    for bin_size in data_bins:
        for cov in data_bins[bin_size]:
            x.append(bin_size)
            y.append(cov)
            r.append(data_bins[bin_size][cov])

    plt.xlabel('Molecule length')
    plt.ylabel('Median depth of coverage')
    plt.scatter(x, y, s=r, alpha=0.5)

    if args.output_file == 'plot':
        plt.show()
    else:
        plt.savefig(args.output_file)
def main():
    '''
    Command-line entry point: generates a figure showing coverage/abundance
    vs. molecule size.

    The pileup file is read line by line.  Column 1 is the molecule ID,
    column 2 a 1-based position and column 4 the coverage at that position.
    Lines for one molecule are expected to be contiguous; when the ID changes,
    the median coverage of the finished molecule is computed and tallied under
    its length bin.  The tallies are drawn as a scatter plot where marker area
    is the number of molecules sharing a (bin, median) pair.

    If --output_file is the literal string 'plot' the figure is shown
    interactively; otherwise it is saved to that path.
    '''
    parser = argparse.ArgumentParser(
        description='Generates a figure showing coverage/abundance vs. molecule size.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input pileup file')
    parser.add_argument('-f', '--fasta_file', type=str, required=True,
                        help='Path to the FASTA file of reference molecules')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    # NOTE(review): --mol_size_limit is parsed but not used anywhere below
    parser.add_argument('-s', '--mol_size_limit', type=int, required=False, default=5000,
                        help='Results for molecules over this size will be grouped together')
    parser.add_argument('-b', '--mol_bin_size', type=int, required=False, default=10,
                        help='Set the binning resolution of the transcript size axis')
    args = parser.parse_args()

    ## first, we need a collection of the FASTA data and the molecule lengths
    molecules = utils.fasta_dict_from_file(args.fasta_file)

    ## data points for plotting
    #  structure:  length bin -> { median coverage -> number of molecules }
    # NOTE(review): the bin key is int(length / mol_bin_size), i.e. a bin
    #  index rather than a length in bp, although the x axis is labelled
    #  'Molecule length'.  Confirm which one is intended.
    data_bins = defaultdict(lambda: defaultdict(int))

    def tally_molecule(mol_id, coverages):
        # records one finished molecule; returns (length, median coverage)
        mol_length = len(molecules[mol_id]['s'])
        mol_length_bin = int(mol_length / args.mol_bin_size)
        median_size = np.median(coverages)
        data_bins[mol_length_bin][median_size] += 1
        return mol_length, median_size

    current_molecule_id = None
    current_molecule_coverages = list()

    ## These files are usually huge.  For scalability, operations performed within this
    #  loop should be limited.
    with open(args.input_file) as pileup_fh:
        for line in pileup_fh:
            cols = line.split("\t")

            if cols[0] != current_molecule_id:
                # on the very first line there's no previous molecule to close out
                if current_molecule_id is not None:
                    mol_length, median_size = tally_molecule(
                        current_molecule_id, current_molecule_coverages)
                    print(
                        "DEBUG: molecule {0} appeared to be {1} bp in length with median coverage of {2}"
                        .format(current_molecule_id, mol_length, median_size))

                # reset
                current_molecule_id = cols[0]
                current_molecule_coverages = [0] * len(molecules[cols[0]]['s'])

            try:
                current_molecule_coverages[int(cols[1]) - 1] = int(cols[3])
            except IndexError:
                print(
                    "ERROR: pileup file reports position {0} coverage but transcript {1} is only {2} bp in length"
                    .format(cols[1], current_molecule_id,
                            len(molecules[cols[0]]['s'])))

    # don't forget the last one - unless the pileup file was empty, in which
    #  case there is no molecule to record
    if current_molecule_id is not None:
        tally_molecule(current_molecule_id, current_molecule_coverages)

    ## now generate the plot data - x,y positions and radii
    x = list()
    y = list()
    r = list()

    for bin_size in data_bins:
        for cov in data_bins[bin_size]:
            x.append(bin_size)
            y.append(cov)
            r.append(data_bins[bin_size][cov])

    plt.xlabel('Molecule length')
    plt.ylabel('Median depth of coverage')
    plt.scatter(x, y, s=r, alpha=0.5)

    if args.output_file == 'plot':
        plt.show()
    else:
        plt.savefig(args.output_file)
def main():
    """Write the IDs of organism-1 mRNAs supported by both AAT and BLAST evidence.

    Steps:
      1. Parse the organism-1 GFF3 annotation and the FASTA database used for AAT.
      2. Parse AAT GFF3 alignments (nucleotide_to_protein_match / match_part rows)
         into Match objects grouped by assembly.
      3. Keep each mRNA that overlaps an AAT match where both the mRNA and the
         matched protein reach --aat_percent_coverage_cutoff.
      4. Parse the BLAST btab file, keeping query IDs with a hit passing the
         e-value (column 20) and percent identity (column 11) cutoffs.
      5. Write the mRNA IDs present in both sets, one per line.

    Raises:
        Exception: if a match_part row precedes any match row, if an overlap is
            larger than the mRNA, or if an AAT target has no entry in -aatdb.
    """
    parser = argparse.ArgumentParser(description="Put a description of your script here")

    parser.add_argument("-a", "--organism1_annotation", type=str, required=True,
                        help="Annotation GFF for organism 1")
    parser.add_argument("-p", "--organism1_aat_alignments", type=str, required=True,
                        help="Path to AAT GFF3 (match/match_part)")
    parser.add_argument("-aatdb", "--aat_fasta_db", type=str, required=True,
                        help="Path to FASTA database that was used in AAT")
    parser.add_argument("-b", "--organism1_blast_alignments", type=str, required=True,
                        help="Path to BLASTp btab file vs.organism 2 proteins")
    parser.add_argument("-be", "--blast_eval_cutoff", type=float, required=False, default=1e-5,
                        help="BLAST e-value cutoff")
    # argparse %-formats help strings, so a literal percent sign must be doubled
    parser.add_argument("-bpi", "--blast_percent_identity_cutoff", type=float, required=False,
                        default=0, help="BLAST %%identity cutoff")
    parser.add_argument("-ppc", "--aat_percent_coverage_cutoff", type=float, required=False,
                        default=0, help="%% coverage of the query protein by the AAT match")
    parser.add_argument("-o", "--output_id_list", type=str, required=False,
                        help="List of IDs from organism1 that passed")
    args = parser.parse_args()

    # set to an mRNA ID to restrict the overlap comparison to that transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith("#") or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], "ID").replace('"', "")
            target = gff.column_9_value(cols[8], "Target")

            # only the first whitespace-delimited token of Target is the protein ID
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == "nucleotide_to_protein_match":
                current_match = things.Match(
                    id=feature_id, target_id=target,
                    subclass="nucleotide_to_protein_match", length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id], fmin=fmin,
                                        fmax=fmax, strand=strand)

                # Store the match as soon as it is created: its parts are added to
                # the same object afterwards, so the last match in the file is not
                # lost and every match is filed under its own assembly.
                aat_matches[assembly_id].append(current_match)
                aat_match_count += 1

            elif cols[2] == "match_part":
                if current_match is None:
                    raise Exception(
                        "ERROR: Found match_part ({0}) before any nucleotide_to_protein_match".format(
                            feature_id))

                parent_id = gff.column_9_value(cols[8], "Parent").replace('"', "")
                match_part = things.MatchPart(id=feature_id, parent=parent_id,
                                              length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id], fmin=fmin,
                                     fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)
                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})".format(
                                overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(
                                aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]["s"]) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(
                        mRNA, aat_match, match_target_length)

                    if (mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                            and target_percent_coverage >= args.aat_percent_coverage_cutoff):
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [mRNA_id for mRNA_id in o1_with_aat if mRNA_id in top_blast_hits]

    print("INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(
        len(o1_with_o2)))

    with open(args.output_id_list, "wt") as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
def main():
    """Summarize how well nucmer query fragments cover a reference and its genes.

    Reads a `show-coords -l -r -T` coords file (only 11-column rows are treated
    as alignments), groups the alignments by reference ID (column 10) and hands
    each group to calculate_gene_coverage_fragments() and
    calculate_fragment_coverage().  Six report files are written, all named
    <output_prefix> plus a fixed suffix.

    Raises:
        Exception: if the reference FASTA holds no residues, or if the coords
            file contains no 11-column alignment rows.
    """
    # local import: only this entry point needs it
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(
        description='Parses nucmer coords output to provide an overall coverage report')

    ## coords file generated with:  show-coords -l -r -T out.delta
    parser.add_argument('-c', '--coords_file', type=str, required=True,
                        help='Path to a nucmer coords file with non-overlapping results (requires -l -r -T options of show-coords)')
    parser.add_argument('-o', '--output_prefix', type=str, required=True,
                        help='Several output files will be created with this prefix.')
    parser.add_argument('-a', '--annotation_file', type=str, required=True,
                        help='Path to a sorted GFF3 annotation file')
    parser.add_argument('-r', '--reference_fasta', type=str, required=True,
                        help='Path to the reference file used with nucmer')
    parser.add_argument('-k', '--annotation_key', type=str, required=False,
                        help='Optional. Key string to look for in the 9th column of the GFF3 file for an annotation string.')
    args = parser.parse_args()

    ## like: h[$assem] = [ {id=?,fmin=?,fmax=?}, ... ]
    annot = parse_annotation(args.annotation_file, args.annotation_key)

    ref_molecules = utils.fasta_dict_from_file(args.reference_fasta)
    ref_n_total = sum(len(ref_molecules[ref_id]['s']) for ref_id in ref_molecules)

    # the coverage percentages below divide by this, so fail early and clearly
    if ref_n_total == 0:
        raise Exception("ERROR: no residues found in reference FASTA: {0}".format(
            args.reference_fasta))

    with ExitStack() as stack:
        def open_output(suffix):
            """Open <output_prefix><suffix> for writing; closed when the stack exits."""
            return stack.enter_context(open(args.output_prefix + suffix, "wt"))

        ## open the output files
        genecov_stats_ofh = open_output(".stats.gene_coverage")
        genesmissing_list_ofh = open_output(".list.genes_missing")
        refmol_stats_ofh = open_output(".stats.refmol_coverage")
        refcov_stats_ofh = open_output(".tab.refmol_coverage")
        refext_list_ofh = open_output(".tab.extensions")
        genecov_tab_ofh = open_output(".tab.gene_coverage")

        refext_list_ofh.write("# {0}\n".format(args.output_prefix))
        refext_list_ofh.write(
            "# reference_id\tref_fmin\tref_fmax\tref_strand\tqry_id\tqry_fmin\tqry_fmax\tqry_strand\tqry_length\n")

        ref_cov_stats = {'n_cov': 0, 'n_uncov': 0, 'n_identical': 0}

        def process_reference(ref_id, ref_length, fragments):
            """Run the coverage calculations for one reference's fragments."""
            if ref_id in annot:
                calculate_gene_coverage_fragments(annot[ref_id], fragments)

            calculate_fragment_coverage(ref_id, fragments, ref_length, ref_cov_stats,
                                        refcov_stats_ofh, refext_list_ofh)

        ## like: [ {id=?,qfmin=?,qfmax=?,rfmin=?,rfmax=?} ]
        query_fragments = []
        alignment_lines_found = 0
        current_ref_id = None
        current_ref_length = None

        with open(args.coords_file, 'r') as coords_fh:
            for line in coords_fh:
                cols = line.split()

                if len(cols) != 11:
                    continue

                alignment_lines_found += 1

                # ref start/end, then query start/end
                cols[0:4] = [int(value) for value in cols[0:4]]

                if cols[9] != current_ref_id:
                    if current_ref_id is not None:
                        process_reference(current_ref_id, current_ref_length, query_fragments)

                    ## reset
                    current_ref_id = cols[9]
                    current_ref_length = int(cols[7])
                    query_fragments = []

                    ## quick sanity check
                    if current_ref_id not in annot:
                        print("WARNING: found a nucleotide accession for which we have no annotation: {0}"
                              .format(current_ref_id))

                # a query whose start is past its end aligned on the reverse strand
                qstrand = -1 if cols[2] > cols[3] else 1

                # coordinates become 0-based, end-exclusive (fmin/fmax)
                query_fragments.append({
                    'id': cols[10],
                    'qfmin': min(cols[2], cols[3]) - 1,
                    'qfmax': max(cols[2], cols[3]),
                    'qlen': int(cols[8]),
                    'qstrand': qstrand,
                    'rfmin': min(cols[0], cols[1]) - 1,
                    'rfmax': max(cols[0], cols[1]),
                    'rlen': int(cols[7]),
                    'pctid': float(cols[6]),
                })

        ## don't forget the last one
        if current_ref_id is not None:
            process_reference(current_ref_id, current_ref_length, query_fragments)

        if alignment_lines_found == 0:
            raise Exception("ERROR: failed to find any 11-column alignment lines")
        else:
            print("INFO: {0} alignment lines found".format(alignment_lines_found))

        report_gene_coverage_results(annot, genecov_stats_ofh, genesmissing_list_ofh,
                                     genecov_tab_ofh)

        cov_perc = (ref_cov_stats['n_cov'] / ref_n_total) * 100
        cov_perc_id = (ref_cov_stats['n_identical'] / ref_n_total) * 100

        refmol_stats_ofh.write(
            "Total bases in reference molecules\t{0}\n".format(ref_n_total))
        refmol_stats_ofh.write(
            "Ref bases covered by query fragments\t{0}\n".format(ref_cov_stats['n_cov']))
        refmol_stats_ofh.write(
            "Ref % covered by query fragments\t{0:.2f}\n".format(cov_perc))
        refmol_stats_ofh.write(
            "Ref % identity by query fragments\t{0:.2f}\n".format(cov_perc_id))
def main():
    '''
    This script reports statistics on the areas of a genome where features aren't -
    introns and intergenic space.  Pass a valid GFF3 file (along with FASTA data) and
    get a report like this:

        Molecule count: 9
        Gene count: 4171
        Intergenic space count: 4061
        Average intergenic space distance: 361.7 bp
        Median intergenic space distance: 245 bp
        Minimum intergenic space distance: 0 bp
        Maximum intergenic space distance: 6272 bp
        Intron count: 10533
        Intron space count: 989024
        Average intron size: 93.9 bp
        Median intron size: 63 bp
        Minimum intron size: 2 bp
        Maximum intron size: 1676 bp

    Optionally, you can pass the path to a PNG file to be created using the --histogram
    parameter, which will generate a size distribution histogram with two overlaying
    plots - one representing the distribution of intergenic region sizes and the other
    the intron lengths.  Because these can often have long tails, you can limit both the
    Y- and X-axes values with the --ylimit and --xlimit options, respectively.

    FASTA:
    If your FASTA isn't embedded at the end of your GFF3 file after a ##FASTA directive
    you'll need to specify the --fasta option in this script and pass it as a separate
    file.

    Definitions:
    Intergenic space was a little ambiguous to me as I started writing this.  Does one
    count the space from the beginning of the contig until the first gene, or only
    between them?  What about short contigs which have no annotated genes at all?

    From the Sequence Ontology:

        SO:0000605: A region containing or overlapping no genes that is bounded on
        either side by a gene, or bounded by a gene and the end of the chromosome.

    To my reading, this includes contig ends but not gene-less contigs.  To that end, I
    include the former in intergenic space reporting but include the latter as a
    separate statistic.

    Overlapping genes contribute no intergenic space; the next space is measured from
    the right-most gene end seen so far on the molecule.

    Author: Joshua Orvis (jorvis AT gmail)
    '''
    parser = argparse.ArgumentParser(
        description='Reports statistics on the introns and intergenic space of a GFF3 annotation.')

    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument('-g', '--histogram', type=str, required=False,
                        help='Optional path to a histogram of intron/intergenic space size distribution to be created (PNG)')
    parser.add_argument('-x', '--xlimit', type=int, required=False,
                        help='Use this if you want to limit the X-axis of the histogram (feature length)')
    parser.add_argument('-y', '--ylimit', type=int, required=False,
                        help='Use this if you want to limit the Y-axis of the histogram (feature count)')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='Required if you don\'t have GFF3 with embedded FASTA')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)

    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## things to keep stats on and report
    total_molecule_count = len(assemblies)
    total_gene_count = 0

    ## the number of intergenic spaces is NOT just the total genes N - 1, since there
    #  can be multiple molecules, genes can overlap, etc.  Counts and residue totals are
    #  derived from these two lists after the calculation section.
    intergenic_distances = list()
    intron_sizes = list()

    total_contig_residues = 0
    empty_contig_residues = 0

    ############################
    ## Calculation section
    ############################

    for asm_id in assemblies:
        assembly = assemblies[asm_id]
        genes = sorted(assembly.genes())
        total_gene_count += len(genes)

        # we should have a length here
        if assembly.length is None or assembly.length == 0:
            raise Exception("ERROR: Detected assembly with undefined or 0 length: {0}".format(assembly.id))

        # Test THIS molecule's genes, not the running total, so gene-less molecules
        # are still recognized after an annotated molecule has been processed.
        if not genes:
            empty_contig_residues += assembly.length
            continue

        total_contig_residues += assembly.length

        # location of the gene reaching furthest right among those processed so far
        previous_gene_loc = None

        for gene in genes:
            gene_loc = gene.location_on(assembly)

            if previous_gene_loc is None:
                # first gene: count the bases from the start of the molecule
                intergenic_distances.append(gene_loc.fmin)
                previous_gene_loc = gene_loc
            elif gene_loc.fmin < previous_gene_loc.fmax:
                # overlaps the covered region: no intergenic space, and only move the
                # boundary when this gene extends past it
                if gene_loc.fmax > previous_gene_loc.fmax:
                    previous_gene_loc = gene_loc
            else:
                intergenic_distances.append(gene_loc.fmin - previous_gene_loc.fmax)
                previous_gene_loc = gene_loc

            for mRNA in gene.mRNAs():
                introns = mRNA.introns(on=assembly)

                for intron in sorted(introns):
                    intron_loc = intron.location_on(assembly)
                    intron_size = intron_loc.fmax - intron_loc.fmin

                    if intron_size < 0:
                        print("\tWARN: Intron size ({1}) < 0 reported in gene {0}".format(gene.id, intron_size))

                    intron_sizes.append(intron_size)

        # space between the right-most gene end and the end of the molecule
        intergenic_distances.append(assembly.length - previous_gene_loc.fmax)

    intergenic_distances.sort()
    intron_sizes.sort()

    total_intergenic_space_count = len(intergenic_distances)
    total_intergenic_space_residues = sum(intergenic_distances)
    total_intron_count = len(intron_sizes)
    total_intron_residues = sum(intron_sizes)

    ############################
    ## Reporting section
    ############################

    print("\nMolecule count: {0}".format(total_molecule_count))
    print("Gene count: {0}".format(total_gene_count))

    print("\nTotal molecule bases: {0} bp".format(total_contig_residues))
    print("Empty molecule bases: {0} bp".format(empty_contig_residues))

    if total_intergenic_space_count > 0:
        avg_intergenic_space_dist = total_intergenic_space_residues / total_intergenic_space_count
        median_int_space_dist = intergenic_distances[total_intergenic_space_count // 2]

        print("Intergenic space count: {0}".format(total_intergenic_space_count))
        print("Average intergenic space distance: {0:.1f} bp".format(avg_intergenic_space_dist))
        print("Median intergenic space distance: {0} bp".format(median_int_space_dist))
        print("Minimum intergenic space distance: {0} bp".format(intergenic_distances[0]))
        print("Maximum intergenic space distance: {0} bp\n".format(intergenic_distances[-1]))
    else:
        print("There were no intergenic spaces found. This might mean there were no molecules with at least 2 genes.")

    print("Intron count: {0}".format(total_intron_count))
    print("Intron space count: {0}".format(total_intron_residues))

    # averages/medians are undefined without introns (e.g. prokaryotic annotation)
    if total_intron_count > 0:
        avg_intron_size = total_intron_residues / total_intron_count
        median_intron_size = intron_sizes[total_intron_count // 2]

        print("Average intron size: {0:.1f} bp".format(avg_intron_size))
        print("Median intron size: {0} bp".format(median_intron_size))
        print("Minimum intron size: {0} bp".format(intron_sizes[0]))
        print("Maximum intron size: {0} bp\n".format(intron_sizes[-1]))
    else:
        print("There were no introns found.\n")

    ############################
    ## Graphics section (optional)
    ############################
    if args.histogram is not None:
        import matplotlib.pyplot as plt

        plt.xlabel('length (bp)')
        plt.ylabel('count')
        plt.title('Distribution of intron size and intergenic distances')

        # only draw a series that actually has data
        if intergenic_distances:
            plt.hist(intergenic_distances, bins=50, histtype='stepfilled', color='b',
                     label='Intergenic distances')
        if intron_sizes:
            plt.hist(intron_sizes, bins=50, histtype='stepfilled', color='r', alpha=0.5,
                     label='Intron sizes')

        if args.xlimit is not None:
            plt.xlim([0, args.xlimit])

        if args.ylimit is not None:
            plt.ylim([0, args.ylimit])

        plt.legend(loc='best')
        plt.savefig(args.histogram)
def main():
    """Infer transcript orientation from read alignments in a SAM file.

    Alignments are tallied per reference transcript (SAM column 3) by read
    direction (last character of the read name, expected to be '1' or '2') and
    by whether the read was reverse-complemented (FLAG bit 16).  Alignments of
    one transcript must be contiguous in the file.  A transcript is considered
    correctly oriented when its 1:T count exceeds its 1:F count (RF library);
    otherwise it is marked for correction.

    A tab-delimited count line is printed per transcript, followed by summary
    totals.  When -fi/-fo are given, all input sequences are written to -fo
    with the marked ones reverse-complemented.

    Raises:
        Exception: if -fi is passed without -fo.
    """
    parser = argparse.ArgumentParser(description='Put a description of your script here')

    parser.add_argument('-s', '--sam_file', type=str, required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument('-fi', '--fasta_in', type=str, required=False,
                        help='Path to a FASTA file representing sequences that were aligned against. If this is passed, you should also pass the -fo argument')
    parser.add_argument('-fo', '--fasta_out', type=str, required=False,
                        help='If passed along with -fi, the orientation-corrected sequences will be written here.')
    args = parser.parse_args()

    seqs = dict()
    if args.fasta_in is not None:
        if args.fasta_out is None:
            raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

        seqs = utils.fasta_dict_from_file(args.fasta_in)

    def new_counts():
        """Return a zeroed tally keyed by read direction, then revcomp state."""
        return {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

    total_read_mappings = 0
    last_transcript_id = None
    counts = new_counts()
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0

    transcripts_to_correct = dict()

    def finish_transcript(transcript_id, tally):
        """Report one transcript's tallies; return True if correctly oriented."""
        ## Given an RF library, the 1:T count should outnumber the 1:F one
        is_correct = tally['1']['T'] > tally['1']['F']

        if not is_correct:
            transcripts_to_correct[transcript_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            transcript_id, tally['1']['T'], tally['1']['F'],
            tally['2']['T'], tally['2']['F']))

        return is_correct

    with open(args.sam_file) as sam_fh:
        for line in sam_fh:
            # header lines
            if line.startswith('@'):
                continue

            cols = line.split("\t")
            if len(cols) < 5:
                continue

            read_dir = cols[0][-1]
            transcript_id = cols[2]
            total_read_mappings += 1

            seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

            if transcript_id != last_transcript_id:
                # a new transcript starts: close out the previous one first
                if last_transcript_id is not None:
                    if finish_transcript(last_transcript_id, counts):
                        correct_orientation_count += 1
                    else:
                        incorrect_orientation_count += 1

                transcript_count += 1
                last_transcript_id = transcript_id
                counts = new_counts()

            # every alignment is tallied, including the first of each transcript
            counts[read_dir][seq_revcomped] += 1

    # don't forget the last transcript in the file
    if last_transcript_id is not None:
        if finish_transcript(last_transcript_id, counts):
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1

    if seqs:
        with open(args.fasta_out, 'w') as out_fh:
            for seq_id in seqs:
                seq = seqs[seq_id]

                if seq_id in transcripts_to_correct:
                    seq['s'] = utils.reverse_complement(seq['s'])

                out_fh.write(">{0} {2}\n{1}\n".format(
                    seq_id, utils.wrapped_fasta(seq['s']), seq['h']))

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(incorrect_orientation_count))
def main():
    """Report, per reference molecule, how many bases have evidence coverage.

    A zero-filled coverage list is created for every sequence in the reference
    FASTA, then each evidence file ending in 'pileup' or 'sam' is folded into
    those lists by parse_pileup() / parse_sam().  Other accepted extensions
    ('bed', 'gff3') are acknowledged but not yet processed.  One tab-delimited
    line (molecule ID, molecule length, covered base count) is written per
    molecule to --output_file, or to STDOUT when it is omitted.

    Diagnostics go to STDERR so they never mix with the report on STDOUT.

    Raises:
        Exception: on an unsupported evidence file extension, or when
            --reference is not of the form /path/to/some.gff3:TYPE.
    """
    parser = argparse.ArgumentParser(
        description='Provides coverage information for features in a GFF3 file')

    parser.add_argument('evidence_files', metavar='N', type=str, nargs='+',
                        help='Path to one or more evidence files, separated by spaces')
    parser.add_argument('-r', '--reference', type=str, required=True,
                        help='Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE')
    parser.add_argument('-f', '--fasta', type=str, required=True,
                        help='Input path to the reference FASTA file.')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional path to an output file to be created, else prints on STDOUT')
    args = parser.parse_args()

    ####################################################
    ## Sanity checks

    allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
    for ev_file in args.evidence_files:
        if not ev_file.endswith(tuple(allowed_extensions)):
            raise Exception(
                "ERROR: Evidence file passed with unsupported file extension: {0}. Supported extensions are {1}"
                .format(ev_file, allowed_extensions))

    ## The input file should be defined as $path:$feattype
    if ':' not in args.reference:
        raise Exception("ERROR: input_file must be like /path/to/some.gff3:mRNA")

    ref_file_parts = args.reference.split(':')
    print("DEBUG: part count: {0}".format(len(ref_file_parts)), file=sys.stderr)

    if ref_file_parts[0].endswith('.gff3'):
        # NOTE(review): the parsed features and the TYPE part are not used below
        (ref_assemblies, ref_features) = gff.get_gff3_features(ref_file_parts[0])
    else:
        raise Exception(
            "ERROR: Expected input file (-r) to have a gff3 extension, got {0}"
            .format(ref_file_parts[0]))

    ## parse the fasta
    fasta = utils.fasta_dict_from_file(args.fasta)

    ####################################################
    ## Initialize the coverage arrays

    # one list of 0s per molecule, the length of that molecule
    fasta_cov = {seq_id: [0] * len(fasta[seq_id]['s']) for seq_id in fasta}

    ####################################################
    ## Now parse the evidence files

    for ev_file in args.evidence_files:
        if ev_file.endswith('pileup'):
            parse_pileup(fasta_cov, ev_file)
        elif ev_file.endswith('sam'):
            parse_sam(fasta_cov, ev_file)
        else:
            print("INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented"
                  .format(ev_file), file=sys.stderr)

    ## open the output file
    if args.output_file is None:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        fout = open(args.output_file, "w")

    try:
        for seq_id, coverage in fasta_cov.items():
            # count positions by their stored depth; the depth is a value,
            # not an index into the list
            covered_bases = sum(1 for depth in coverage if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id, len(fasta[seq_id]['s']), covered_bases))
    finally:
        # leave STDOUT open for the interpreter; only close files we opened
        if args.output_file is not None:
            fout.close()
def main():
    """Report non-standard residues in a multi-FASTA file and optionally replace one.

    Every residue (compared upper-cased) that is neither a standard residue for
    the chosen --type nor listed in --ignore is counted per sequence.  With
    --print_locations each occurrence is reported on STDERR with its 1-based
    position; with --list a line per affected sequence is written as
    ID<TAB>residue:count[...].  With --replace/--with_ every sequence is written
    to --output (or STDOUT) in 60-column FASTA after a case-sensitive
    replacement.

    Raises:
        Exception: if only one of --replace / --with_ is passed.
    """
    parser = argparse.ArgumentParser(
        description='Reports on non-standard characters in multifasta files and can optionally replace residues')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-t', '--type', type=str, required=True, choices=('n', 'p'),
                        help='Either n for nucleotide or p for protein')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Path to an output FASTA file to be created if doing replacement')
    parser.add_argument('-pl', '--print_locations', dest='print_locations', action='store_true',
                        help='If passed, will report coordinate of each non-standard residue on STDERR')
    parser.add_argument('-r', '--replace', type=str, required=False,
                        help='Replace this character with the one defined by --with_')
    parser.add_argument('-w', '--with_', type=str, required=False,
                        help='This character or set replaces all instances of the one found in --replace')
    parser.add_argument('-l', '--list', type=str, required=False,
                        help='Optional file of IDs where non-standard residues were detected or replaced')
    parser.add_argument('-g', '--ignore', type=str, required=False, default='N*X',
                        help='List of characters to not report as non-standard. Default = the universal ambiguity bases (N, X) or the end-of-translation stop for proteins (*)')
    parser.set_defaults(print_locations=False)
    args = parser.parse_args()

    ## if you define --replace, you must also define --with_, and vice versa.
    #  Checked before any output file is created so bad arguments leave nothing behind.
    if args.replace is not None and args.with_ is None:
        raise Exception("ERROR: You must pass --with_ when passing --replace")

    if args.with_ is not None and args.replace is None:
        raise Exception("ERROR: You must pass --replace when passing --with_")

    seqs = utils.fasta_dict_from_file(args.input)

    ## standard characters (depends on the type of sequence)
    if args.type == 'n':
        standard_residues = set("ATGCU")
    else:
        standard_residues = set("ACDEFGHIKLMNPQRSTVWY")

    ## build the lookup of characters to ignore
    ignore_residues = set(args.ignore.upper())

    out_fh = sys.stdout if args.output is None else open(args.output, 'wt')
    list_fh = None

    try:
        if args.list is not None:
            list_fh = open(args.list, 'wt')

        ## process the sequences
        for seq_id in seqs:
            seq = seqs[seq_id]
            bad_chars = dict()

            for position, base in enumerate(seq['s'], start=1):
                ubase = base.upper()

                if ubase in standard_residues or ubase in ignore_residues:
                    continue

                bad_chars[ubase] = bad_chars.get(ubase, 0) + 1

                if args.print_locations:
                    print("Molecule {0} contains residue {1} at position {2}".format(
                        seq_id, ubase, position), file=sys.stderr)

            if list_fh is not None and bad_chars:
                list_fh.write("{0}".format(seq_id))

                for base in bad_chars:
                    list_fh.write("\t{0}:{1}".format(base, bad_chars[base]))

                list_fh.write("\n")

            if args.replace is not None:
                seq['s'] = seq['s'].replace(args.replace, args.with_)
                out_fh.write(">{0} {1}\n".format(seq_id, seq['h']))

                # wrap residues at 60 columns
                for start in range(0, len(seq['s']), 60):
                    out_fh.write(seq['s'][start:start + 60] + "\n")
    finally:
        if list_fh is not None:
            list_fh.close()

        # never close the interpreter's STDOUT
        if out_fh is not sys.stdout:
            out_fh.close()
def main():
    """Mask long homopolymer runs in a FASTA file with N characters.

    Every run of one repeated character longer than --homopolymer_length_limit
    (runs of 'N' excepted) is replaced by the same number of 'N's.  Comparison
    is case-sensitive.  A warning with the 1-based start position is written to
    STDERR for each replaced run.  The masked sequences are written as wrapped
    FASTA to --output, or STDOUT when it is omitted.
    """
    parser = argparse.ArgumentParser(
        description='Replaces long homopolymeric stretches with N characters')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-hll', '--homopolymer_length_limit', type=int, required=True,
                        help='Stretches of non-N residues longer than this will be replaced with Ns')
    args = parser.parse_args()

    out_fh = sys.stdout if args.output is None else open(args.output, 'wt')

    try:
        sys.stderr.write("INFO: Parsing input FASTA\n")
        sys.stderr.flush()
        seqs = utils.fasta_dict_from_file(args.input)

        sys.stderr.write("INFO: Looking for homopolymeric runs > {0} bp\n".format(
            args.homopolymer_length_limit))
        sys.stderr.flush()

        for seq_id in seqs:
            original_seq = seqs[seq_id]['s']

            # Rebuild the sequence from untouched stretches and N-masks so that
            # every qualifying run is masked, not just the most recent one.
            pieces = []
            cursor = 0

            for run_start, run_length, run_base in _find_homopolymer_runs(
                    original_seq, args.homopolymer_length_limit):
                sys.stderr.write(
                    "WARNING: Replacing {3} bp of {2}s in Sequence ID {0} starting at position {1}\n"
                    .format(seq_id, run_start + 1, run_base, run_length))
                sys.stderr.flush()

                pieces.append(original_seq[cursor:run_start])
                pieces.append('N' * run_length)
                cursor = run_start + run_length

            pieces.append(original_seq[cursor:])
            seqs[seq_id]['s'] = "".join(pieces)

            out_fh.write(">{0} {1}\n".format(seq_id, seqs[seq_id]['h']))
            out_fh.write(utils.wrapped_fasta(seqs[seq_id]['s']))
            out_fh.write("\n")
    finally:
        # never close the interpreter's STDOUT
        if out_fh is not sys.stdout:
            out_fh.close()


def _find_homopolymer_runs(seq, length_limit):
    """Yield (start_index, length, base) for each non-'N' run longer than length_limit.

    start_index is 0-based; runs are yielded in left-to-right order and include
    a run that ends the sequence.
    """
    seq_length = len(seq)
    run_start = 0

    while run_start < seq_length:
        base = seq[run_start]
        run_end = run_start + 1

        while run_end < seq_length and seq[run_end] == base:
            run_end += 1

        if run_end - run_start > length_limit and base != 'N':
            yield run_start, run_end - run_start, base

        run_start = run_end
def main():
    """Export gene sequences from a GFF3 as FASTA with introns lower-cased.

    For every gene with a single mRNA, the gene residues are upper-cased, the
    intron ranges of that mRNA are lower-cased, and (with -t CDS) the sequence
    is trimmed to the span from the first CDS start to the last CDS end.  The
    record header is the gene's locus_tag when set, otherwise its ID.  Genes
    with more than one mRNA are skipped.

    Sequences go to --output_file or STDOUT; all diagnostics go to STDERR so
    they cannot end up inside the FASTA stream.
    """
    parser = argparse.ArgumentParser(description='')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Path to an output file to be created')
    parser.add_argument('-f', '--fasta', type=str, required=False,
                        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t', '--type', type=str, required=False, default='mRNA',
                        choices=['mRNA', 'CDS'], help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # set this to a gene ID to process only that gene and trace it on STDERR;
    # leave as None if you don't want the debug print statements
    #debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None
    debug_mode = debugging_gene is not None

    def debug(message):
        """Print a trace message on STDERR when debugging a single gene."""
        if debug_mode:
            print(message, file=sys.stderr)

    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout if args.output_file is None else open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                if debug_mode and gene.id != debugging_gene:
                    continue

                gene_label = gene.id if gene.locus_tag is None else gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)
                on_reverse_strand = gene_loc.strand == -1

                ## Reverse (not complement) reverse-strand genes so that string
                #  indexes run in assembly coordinate order while slicing below.
                if on_reverse_strand:
                    gene_seq = gene_seq[::-1]

                debug("INFO: Processing gene with length {0} at {1}-{2}".format(
                    len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    print("ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)".format(gene.id),
                          file=sys.stderr)
                    continue

                # this helps us get where features are on the gene
                offset = gene_loc.fmin

                for mRNA in gene.mRNAs():
                    for intron in mRNA.introns(on=assembly):
                        intron_loc = intron.location_on(assembly)
                        intron_start = intron_loc.fmin - offset
                        intron_end = intron_loc.fmax - offset

                        lower_mid = gene_seq[intron_start:intron_end].lower()
                        gene_seq = gene_seq[0:intron_start] + lower_mid + gene_seq[intron_end:]

                        debug("INFO:\tfound intron at {0}-{1}".format(intron_loc.fmin, intron_loc.fmax))
                        debug("INFO:\tlower-casing offset adjusted coordinates: {0}-{1}".format(
                            intron_start, intron_end))
                        debug("INFO:\tgenerating lower case seq of length: {0}\n".format(len(lower_mid)))

                    debug("INFO: seq length before CDS processing is: {0}".format(len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())

                        # a non-coding mRNA has no CDS range to export
                        if not CDSs:
                            print("ERROR: skipping mRNA {0} because it has no CDS features".format(mRNA.id),
                                  file=sys.stderr)
                            continue

                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        debug("INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}".format(
                            CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            debug("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                                gene.id, gene_loc.fmin, gene_loc.fmax, gene_loc.strand, CDS_min, CDS_max))
                            debug("\tfmin_chomp:{0}, fmax_chomp:{1}".format(fmin_chomp, fmax_chomp))
                            debug("\tpulling range: gene_seq[{0} : {1}]".format(
                                fmin_chomp, len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) - fmax_chomp]

                            debug("\tGene {0} CDS seq: {1}".format(gene.id, gene_seq))

                    ## make sure to switch it back
                    if on_reverse_strand:
                        gene_seq = gene_seq[::-1]

                    ofh.write(">{0}\n{1}\n".format(gene_label, utils.wrapped_fasta(gene_seq)))
    finally:
        # never close the interpreter's STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Write one FASTA record per gene from a GFF3 file, lower-casing introns.

    The gene residues are upper-cased, then every intron range of the gene's
    single mRNA is lower-cased.  With ``-t CDS`` the sequence is additionally
    trimmed to the span between the lowest and highest CDS coordinate.  Genes
    with more than one mRNA are skipped with a message on STDERR.

    Output goes to ``-o`` if given, otherwise STDOUT.
    """
    parser = argparse.ArgumentParser(description='')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help='Required if you don\'t have GFF3 with embedded FASTA')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='mRNA',
                        choices=['mRNA', 'CDS'],
                        help='Feature type to export (mRNA or CDS)')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # set this to None if you don't want the debug print statements
    #debugging_gene = 'D9AE6116893A0D5711D56C0F1E6CF58C'
    debugging_gene = None
    debug_mode = debugging_gene is not None

    # Residues from a separate FASTA file override/populate the assemblies
    if args.fasta is not None:
        seqs = utils.fasta_dict_from_file(args.fasta)
        for seq_id in seqs:
            if seq_id in assemblies:
                assemblies[seq_id].residues = seqs[seq_id]['s']
                assemblies[seq_id].length = len(assemblies[seq_id].residues)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                # in debug mode only the selected gene is processed
                if debug_mode and gene.id != debugging_gene:
                    continue

                if gene.locus_tag is None:
                    gene_label = gene.id
                else:
                    gene_label = gene.locus_tag

                gene_seq = gene.get_residues().upper()
                gene_loc = gene.location_on(assembly)

                ## we have to do this here because of the coordinates
                # NOTE(review): presumably get_residues() returns minus-strand
                # genes in transcribed orientation, so the string is flipped to
                # line up with fmin-based offsets -- confirm against things.Gene
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                if debug_mode:
                    print(
                        "INFO: Processing gene with length {0} at {1}-{2}".format(
                            len(gene_seq), gene_loc.fmin, gene_loc.fmax))

                if len(gene.mRNAs()) > 1:
                    # Diagnostics go to STDERR so they can never end up inside
                    # the FASTA stream when output is written to STDOUT.
                    print(
                        "ERROR: skipping gene {0} because it appears to have multiple isoforms (not currently supported)"
                        .format(gene.id),
                        file=sys.stderr)
                    continue

                for mRNA in gene.mRNAs():
                    introns = mRNA.introns(on=assembly)

                    # this helps us get where the intron is on the gene
                    offset = gene_loc.fmin

                    for intron in introns:
                        intron_loc = intron.location_on(assembly)
                        rel_start = intron_loc.fmin - offset
                        rel_end = intron_loc.fmax - offset

                        lower_mid = gene_seq[rel_start:rel_end].lower()
                        gene_seq = (gene_seq[0:rel_start] + lower_mid +
                                    gene_seq[rel_end:])

                        if debug_mode:
                            print("INFO:\tfound intron at {0}-{1}".format(
                                intron_loc.fmin, intron_loc.fmax))
                            print(
                                "INFO:\tlower-casing offset adjusted coordinates: {0}-{1}"
                                .format(rel_start, rel_end))
                            print(
                                "INFO:\tgenerating lower case seq of length: {0}\n"
                                .format(len(lower_mid)))

                    if debug_mode:
                        print("INFO: seq length before CDS processing is: {0}".
                              format(len(gene_seq)))

                    ## do we need to trim down to the CDS range?
                    if args.type == 'CDS':
                        CDSs = sorted(mRNA.CDSs())
                        CDS_min = CDSs[0].location_on(assembly).fmin
                        CDS_max = CDSs[-1].location_on(assembly).fmax

                        if debug_mode:
                            print(
                                "INFO: Calculated CDS range, with introns, should be: {0}-{1}={2}"
                                .format(CDS_max, CDS_min, CDS_max - CDS_min))

                        if gene_loc.fmin != CDS_min or gene_loc.fmax != CDS_max:
                            # number of bases to drop from each end of the gene
                            fmin_chomp = CDS_min - offset
                            fmax_chomp = gene_loc.fmax - CDS_max

                            if debug_mode:
                                print("gene:{0} coords:{1}-{2} ({3}), CDS coords: {4}-{5}".format(
                                    gene.id, gene_loc.fmin, gene_loc.fmax,
                                    gene_loc.strand, CDS_min, CDS_max))
                                print("\tfmin_chomp:{0}, fmax_chomp:{1}".format(
                                    fmin_chomp, fmax_chomp))
                                print(
                                    "\tpulling range: gene_seq[{0} : {1}]".format(
                                        fmin_chomp,
                                        len(gene_seq) - fmax_chomp))

                            gene_seq = gene_seq[fmin_chomp:len(gene_seq) -
                                                fmax_chomp]

                        if debug_mode:
                            print("\tGene {0} CDS seq: {1}".format(
                                gene.id, gene_seq))

                ## make sure to switch it back
                if gene_loc.strand == -1:
                    gene_seq = gene_seq[::-1]

                ofh.write(">{0}\n{1}\n".format(gene_label,
                                               utils.wrapped_fasta(gene_seq)))
    finally:
        # never close STDOUT, but do release a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def load_from_file(self, file):
    """Add one Polypeptide to this collection for every entry of a FASTA file.

    Each polypeptide takes the FASTA entry's ID as its id and the entry's
    's' value as its residues.
    """
    for seq_id, entry in utils.fasta_dict_from_file(file).items():
        self.add(Polypeptide(id=seq_id, residues=entry['s']))
def main():
    """Report how many bases of each reference sequence are covered by evidence.

    For every sequence in the reference FASTA a per-base depth list is built,
    filled in from the pileup/SAM evidence files, and one tab-delimited line
    is written: sequence ID, sequence length, number of bases with depth > 0.

    Raises:
        Exception: if an evidence file has an unsupported extension, or the
            ``-r`` value is not of the form ``/path/to/some.gff3:TYPE``.
    """
    parser = argparse.ArgumentParser( description='Provides coverage information for features in a GFF3 file')

    ## output file to be written
    parser.add_argument('evidence_files', metavar='N', type=str, nargs='+', help='Path to one or more evidence files, separated by spaces' )
    parser.add_argument('-r', '--reference', type=str, required=True, help='Input path to the reference GFF3 file. So we know what feature type to report on, format should be like FILE:TYPE' )
    parser.add_argument('-f', '--fasta', type=str, required=True, help='Input path to the reference FASTA file.' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional path to an output file to be created, else prints on STDOUT' )
    args = parser.parse_args()

    ## parse the fasta
    fasta = utils.fasta_dict_from_file(args.fasta)

    ## open the output file
    if args.output_file is None:
        fout = codecs.getwriter('utf8')(sys.stdout.buffer)
    else:
        fout = open(args.output_file, "w")

    try:
        ####################################################
        ## Sanity checks

        allowed_extensions = ['bed', 'gff3', 'pileup', 'sam']
        for ev_file in args.evidence_files:
            if not ev_file.endswith(tuple(allowed_extensions)):
                raise Exception("ERROR: Evidence file passed with unsupported file extension: {0}. Supported extensions are {1}".format(ev_file, allowed_extensions))

        ## The input file should be defined as $path:$feattype
        if ':' not in args.reference:
            raise Exception("ERROR: input_file must be like /path/to/some.gff3:mRNA")

        ref_file_parts = args.reference.split(':')
        print("DEBUG: part count: {0}".format(len(ref_file_parts)))

        if ref_file_parts[0].endswith('.gff3'):
            (ref_assemblies, ref_features) = gff.get_gff3_features(ref_file_parts[0])
        else:
            # the reference is passed with -r (the message used to say -i)
            raise Exception("ERROR: Expected input file (-r) to have a gff3 extension, got {0}".format(ref_file_parts[0]))

        ####################################################
        ## Initialize the coverage arrays

        fasta_cov = dict()
        for seq_id in fasta:
            # create a list of 0s the length of the molecule
            fasta_cov[seq_id] = [0] * len(fasta[seq_id]['s'])

        ####################################################
        ## Now parse the evidence files

        for ev_file in args.evidence_files:
            if ev_file.endswith('pileup'):
                parse_pileup(fasta_cov, ev_file)
            elif ev_file.endswith('sam'):
                parse_sam(fasta_cov, ev_file)
            else:
                print("INFO: ignoring evidence file {0} because code to handle its file type isn't currently implemented".format(ev_file))

        for seq_id in fasta_cov:
            # Count positions whose depth is non-zero.  The depth values must
            # be inspected directly; using them as list indices (as the code
            # once did) counts the wrong positions or raises IndexError.
            covered_bases = sum(1 for depth in fasta_cov[seq_id] if depth > 0)

            fout.write("{0}\t{1}\t{2}\n".format(seq_id, len(fasta[seq_id]['s']), covered_bases))
    finally:
        # only close what we opened; the STDOUT wrapper is left alone
        if args.output_file is not None:
            fout.close()
def main():
    """Select organism-1 mRNAs supported by both AAT alignments and BLAST hits.

    Steps:
      1. Parse the organism-1 GFF3 annotation and the AAT FASTA database.
      2. Parse AAT ``nucleotide_to_protein_match`` / ``match_part`` rows into
         Match objects grouped by assembly.
      3. Keep mRNAs that overlap a match where both the mRNA and the match
         target reach the ``-ppc`` coverage cutoff.
      4. Keep, from those, the mRNAs that also have a BLAST row passing the
         e-value and percent-identity cutoffs.
      5. Write the surviving mRNA IDs, one per line, to the output list.

    Raises:
        Exception: if the overlap size exceeds the mRNA length, or a match
            target ID is missing from the ``-aatdb`` FASTA.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')
    parser.add_argument('-a',
                        '--organism1_annotation',
                        type=str,
                        required=True,
                        help='Annotation GFF for organism 1')
    parser.add_argument('-p',
                        '--organism1_aat_alignments',
                        type=str,
                        required=True,
                        help='Path to AAT GFF3 (match/match_part)')
    parser.add_argument('-aatdb',
                        '--aat_fasta_db',
                        type=str,
                        required=True,
                        help='Path to FASTA database that was used in AAT')
    parser.add_argument('-b',
                        '--organism1_blast_alignments',
                        type=str,
                        required=True,
                        help='Path to BLASTp btab file vs.organism 2 proteins')
    parser.add_argument('-be',
                        '--blast_eval_cutoff',
                        type=float,
                        required=False,
                        default=1e-5,
                        help='BLAST e-value cutoff')
    # argparse %-formats help strings, so a literal percent sign must be
    # written as %%; a bare % makes -h fail.
    parser.add_argument('-bpi',
                        '--blast_percent_identity_cutoff',
                        type=float,
                        required=False,
                        default=0,
                        help='BLAST %%identity cutoff')
    parser.add_argument(
        '-ppc',
        '--aat_percent_coverage_cutoff',
        type=float,
        required=False,
        default=0,
        help='%% coverage of the query protein by the AAT match')
    parser.add_argument('-o',
                        '--output_id_list',
                        type=str,
                        required=False,
                        help='List of IDs from organism1 that passed')
    args = parser.parse_args()

    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()
    o1_with_o2 = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], 'ID').replace('"', '')
            target = gff.column_9_value(cols[8], 'Target')

            # keep only the first whitespace-delimited token of the target
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                current_match = things.Match(
                    id=feature_id,
                    target_id=target,
                    subclass='nucleotide_to_protein_match',
                    length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id],
                                        fmin=fmin,
                                        fmax=fmax,
                                        strand=strand)

                # Register the match as soon as it is created.  Deferring the
                # append until the *next* match row (as before) dropped the
                # final match and filed a match under the next row's assembly.
                # Parts added below are still seen through the stored reference.
                aat_matches[assembly_id].append(current_match)
                aat_match_count += 1

            elif cols[2] == 'match_part':
                parent_id = gff.column_9_value(cols[8],
                                               'Parent').replace('"', '')
                match_part = things.MatchPart(id=feature_id,
                                              parent=parent_id,
                                              length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id],
                                     fmin=fmin,
                                     fmax=fmax,
                                     strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".
                              format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is not None:
                        # this shouldn't be possible, but check just in case
                        if overlap_size > mRNA.length:
                            raise Exception(
                                "ERROR: overlap size ({0}) > mRNA length ({1})"
                                .format(overlap_size, mRNA.length))

                        if aat_match.target_id not in aat_seqs:
                            raise Exception(
                                "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb"
                                .format(aat_match.target_id))

                        # this is a protein length, so x3
                        match_target_length = len(
                            aat_seqs[aat_match.target_id]['s']) * 3

                        (mRNA_percent_coverage, target_percent_coverage
                         ) = calculate_fragmented_coverage(
                             mRNA, aat_match, match_target_length)

                        if (mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                                and target_percent_coverage >= args.aat_percent_coverage_cutoff):
                            o1_with_aat.append(mRNA.id)
                            break  # only need to see if one matched

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".
        format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column 20 holds the e-value, column 11 the percent identity
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print(
        "INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    for o1_mRNA_id in o1_with_aat:
        if o1_mRNA_id in top_blast_hits:
            o1_with_o2.append(o1_mRNA_id)

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2"
        .format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
def main():
    """Plot a histogram of per-transcript correct read-orientation ratios.

    Each input line looks like
    ``comp0_c0_seq1<TAB>1-T:6<TAB>1-F:0<TAB>2-T:0<TAB>2-F:5``.  For sequences
    at least LENGTH_CUTOFF long, the ratio ``1-T / (1-T + 1-F)`` is collected,
    plotted with a log-scaled Y axis and saved to the output file.  Counts of
    ratios below, between and above RATIO_MIN/RATIO_MAX go to STDERR, and the
    IDs in the middle range are printed to STDOUT.

    Raises:
        Exception: if an ID from the input file is absent from the FASTA file.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here' )

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    args = parser.parse_args()

    ratios = list()

    # if set to true, the IDs in the mid-range will be printed to STDOUT
    print_ids = True

    RATIO_MIN = 0.05
    RATIO_MAX = 0.95
    #RATIO_MIN = 0.125
    #RATIO_MAX = 0.875
    #RATIO_MIN = 0.25
    #RATIO_MAX = 0.75
    #RATIO_MIN = 0.475
    #RATIO_MAX = 0.525

    LENGTH_CUTOFF = 350

    ratio_min_count = 0
    ratio_bet_count = 0
    ratio_max_count = 0

    fasta = utils.fasta_dict_from_file(args.fasta_file)

    # lines are like: comp0_c0_seq1 1-T:6 1-F:0 2-T:0 2-F:5
    # (raw string so \d is a regex escape rather than an invalid str escape;
    # groups 4 and 5, the read-2 counts, are matched but not used)
    line_pattern = re.compile(r'(.+)\t1-T:(\d+)\t1-F:(\d+)\t2-T:(\d+)\t2-F:(\d+)')

    with open(args.input_file) as input_fh:
        for line in input_fh:
            m = line_pattern.search(line)
            if not m:
                continue

            seq_id = m.group(1)

            if seq_id not in fasta:
                raise Exception("Expected but failed to find seq ID {0} in FASTA file".format(seq_id))

            if len(fasta[seq_id]['s']) < LENGTH_CUTOFF:
                continue

            f_reads_correctly_mapped = int(m.group(2))
            f_reads_incorrectly_mapped = int(m.group(3))
            f_read_count = f_reads_correctly_mapped + f_reads_incorrectly_mapped

            # transcripts without any forward reads have no defined ratio
            if f_read_count > 0:
                correct_ratio = f_reads_correctly_mapped / f_read_count
                ratios.append(correct_ratio)

                if correct_ratio < RATIO_MIN:
                    ratio_min_count += 1
                elif correct_ratio > RATIO_MAX:
                    ratio_max_count += 1
                else:
                    ratio_bet_count += 1
                    if print_ids:
                        print(seq_id)

    plt.hist(ratios, bins=100)
    plt.xlabel("Correct read orientation alignment ratio")
    plt.ylabel("Log of transcript count")
    plt.grid(True)
    #plt.ylim(0,5000)
    plt.gca().set_yscale("log")
    plt.savefig(args.output_file)

    sys.stderr.write("Count of ratios < {0}: {1}\n".format(RATIO_MIN, ratio_min_count))
    # the middle bucket is inclusive of both bounds
    sys.stderr.write("Count where {0} <= ratio <= {1}: {2}\n".format(RATIO_MIN, RATIO_MAX, ratio_bet_count))
    sys.stderr.write("Count of ratios > {0}: {1}\n".format(RATIO_MAX, ratio_max_count))
def load_from_file(self, file):
    """Add one mRNA to this collection for every entry of a FASTA file.

    Each mRNA takes the FASTA entry's ID as its id and the entry's 's' value
    as its residues.
    """
    seqs = utils.fasta_dict_from_file(file)

    for seq_id in seqs:
        # The local must not be called `mRNA`: assigning to that name makes it
        # function-local, so evaluating `mRNA(...)` on the right-hand side
        # raises UnboundLocalError instead of calling the class.
        transcript = mRNA(id=seq_id, residues=seqs[seq_id]['s'])
        self.add(transcript)
def main():
    """Find inverted and direct repeats by BLASTing each sequence against itself.

    Every input sequence is written to a temporary FASTA file and aligned to
    itself with ``blastn`` (tabular output).  Hits at least
    ``--min_repeat_size`` long are reported as INVERSION when query and
    subject orientations differ, or as DIRECT REPEAT when they agree and the
    query range does not start or end inside the subject range.
    """
    parser = argparse.ArgumentParser( description='Use BLAST to identify internal inverted repeats')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-n', '--min_repeat_size', type=int, required=True, help='Minimum size of a repeat to consider' )
    parser.add_argument('-pid', '--percent_identity', type=float, required=False, default=98.0, help='Percent identity cutoff' )
    args = parser.parse_args()

    # parse FASTA input, storing into a dict keyed by ID
    seqs = utils.fasta_dict_from_file(args.input_file)

    seqs_processed = 0
    print_interval = 100

    # The temp file names only depend on the PID, so build them (and the
    # command line) once; the files are overwritten for every sequence.
    fasta_name = "{0}.temp.input.fasta".format(os.getpid())
    blast_name = "{0}.temp.blast.out".format(os.getpid())
    cmd = "blastn -query {0} -subject {0} -outfmt 6 -out {1} -word_size {2} -perc_identity {3}".format(fasta_name, blast_name, args.min_repeat_size, args.percent_identity)

    with open(args.output_file, 'wt') as ofh:
        for seq_id in seqs:
            # Write a FASTA of just this sequence
            with open(fasta_name, 'wt') as fasta_fh:
                fasta_fh.write(">{0}\n{1}".format(seq_id, seqs[seq_id]['s']))

            # Perform the self-vs-self blast
            run_command(cmd)

            # Parse the result file to look for inverted repeats
            with open(blast_name) as blast_fh:
                for line in blast_fh:
                    if line.startswith('#'):
                        continue

                    cols = line.split()
                    qstart, qend, sstart, send = int(cols[6]), int(cols[7]), int(cols[8]), int(cols[9])

                    if qstart < qend:
                        q_orientation = 'F'
                        match_len = qend - qstart + 1
                    else:
                        q_orientation = 'R'
                        match_len = qstart - qend + 1

                    s_orientation = 'F' if sstart < send else 'R'

                    if match_len < args.min_repeat_size:
                        continue

                    if s_orientation != q_orientation:
                        ofh.write("INVERSION of {5} bp in {4}: {0}\t{1}\t{2}\t{3}\n".format(qstart, qend, sstart, send, cols[0], match_len))
                    else:
                        # same orientation: skip hits whose query start or end
                        # falls inside the subject range (e.g. the self-hit)
                        query_in_subject = (sstart <= qstart <= send) or (sstart <= qend <= send)
                        if not query_in_subject:
                            ofh.write("DIRECT REPEAT of {5} bp in {4}: {0}\t{1}\t{2}\t{3}\n".format(qstart, qend, sstart, send, cols[0], match_len))

            seqs_processed += 1
            if seqs_processed % print_interval == 0:
                print("INFO: processed {0} input sequences".format(seqs_processed))
def main():
    """Split FASTA sequences at the gaps listed in a tab-delimited gaps file.

    Each gaps-file row is ``molecule_id<TAB>start<TAB>stop``; rows for the
    same molecule are expected to be adjacent.  The regions before, between
    and after the gaps are passed to ``export_fragment``.  Molecules that are
    not recorded in ``molecules_split`` afterwards are written out whole.
    """
    parser = argparse.ArgumentParser(
        description='Splits FASTA file based on reported coverage gaps')

    ## output file to be written
    parser.add_argument('-g',
                        '--gaps_file',
                        type=str,
                        required=True,
                        help='Path to an input gaps file to be read')
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    parser.add_argument(
        '-mfl',
        '--min_fragment_length',
        type=int,
        required=False,
        help='Min length required for a fragment to be exported')
    parser.add_argument('-mgl',
                        '--min_gap_length',
                        type=int,
                        required=False,
                        help='Ignore gaps reported under this min length')
    args = parser.parse_args()

    fasta = utils.fasta_dict_from_file(args.fasta_file)

    # this is just to keep track of which we've exported
    molecules_split = dict()

    last_molecule_id = None
    last_end_coordinate = None

    with open(args.output_file, 'wt') as ofh, open(args.gaps_file) as gaps_fh:
        for line in gaps_fh:
            cols = line.split('\t')
            mol_id, start, stop = cols[0], int(cols[1]), int(cols[2])

            # skip if this is too short
            if args.min_gap_length is not None and (
                    stop - start + 1) < args.min_gap_length:
                continue

            if last_molecule_id is None:
                # first entry, export only beginning of molecule to gap start
                export_fragment(ofh, fasta, mol_id, 1, start - 1,
                                args.min_fragment_length, molecules_split)

            elif mol_id != last_molecule_id:
                # new molecule, export end of last molecule
                last_molecule_length = len(fasta[last_molecule_id]['s'])
                export_fragment(ofh, fasta, last_molecule_id,
                                last_end_coordinate + 1, last_molecule_length,
                                args.min_fragment_length, molecules_split)

                # now export the beginning of this one unless the start is 1
                if start != 1:
                    export_fragment(ofh, fasta, mol_id, 1, start - 1,
                                    args.min_fragment_length, molecules_split)

            else:
                # same molecule as we just saw, but new entry for it
                #  export end of last gap until beginning of this one
                export_fragment(ofh, fasta, mol_id, last_end_coordinate + 1,
                                start - 1, args.min_fragment_length,
                                molecules_split)

            last_molecule_id = mol_id
            last_end_coordinate = stop

        # The loop only exports a molecule's tail when the *next* molecule
        # shows up, so the region after the final gap of the last molecule
        # has to be exported here or it is silently lost.
        if last_molecule_id is not None:
            last_molecule_length = len(fasta[last_molecule_id]['s'])
            export_fragment(ofh, fasta, last_molecule_id,
                            last_end_coordinate + 1, last_molecule_length,
                            args.min_fragment_length, molecules_split)

        # Now export the full sequences of any which weren't split
        for seq_id in fasta:
            if seq_id not in molecules_split:
                ofh.write(">{0}\n{1}\n".format(
                    seq_id, utils.wrapped_fasta(fasta[seq_id]['s'])))
def main():
    """Report non-standard residues in a multi-FASTA file, optionally replacing one.

    Residues are compared case-insensitively against the standard alphabet
    for the chosen type (``ATGCU`` for nucleotide, the 20 amino acids for
    protein) and the ``--ignore`` set.  Per-sequence counts of offending
    residues can be written to ``--list``, and each location to STDERR with
    ``--print_locations``.  When ``--replace``/``--with_`` are given, the
    (possibly modified) sequences are written as 60-column FASTA to
    ``--output`` or STDOUT.

    Raises:
        Exception: if only one of ``--replace`` / ``--with_`` is passed.
    """
    parser = argparse.ArgumentParser( description='Reports on non-standard characters in multifasta files and can optionally replace residues')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-t', '--type', type=str, required=True, choices=('n', 'p'), help='Either n for nucleotide or p for protein')
    parser.add_argument('-o', '--output', type=str, required=False, help='Path to an output FASTA file to be created if doing replacement' )
    parser.add_argument('-pl', '--print_locations', dest='print_locations', action='store_true', help='If passed, will report coordinate of each non-standard residue on STDERR' )
    parser.add_argument('-r', '--replace', type=str, required=False, help='Replace this character with the one defined by --with_' )
    parser.add_argument('-w', '--with_', type=str, required=False, help='This character or set replaces all instances of the one found in --replace' )
    parser.add_argument('-l', '--list', type=str, required=False, help='Optional file of IDs where non-standard residues were detected or replaced' )
    parser.add_argument('-g', '--ignore', type=str, required=False, default='N*X', help='List of characters to not report as non-standard. Default = the universal ambiguity bases (N, X) or the end-of-translation stop for proteins (*)' )
    parser.set_defaults(print_locations=False)
    args = parser.parse_args()

    ## if you define --replace, you must also define --with_, and vice versa
    ## (checked before any file is opened so a usage error leaves nothing behind)
    if args.replace is not None and args.with_ is None:
        raise Exception("ERROR: You must pass --with_ when passing --replace")
    if args.with_ is not None and args.replace is None:
        raise Exception("ERROR: You must pass --replace when passing --with_")

    seqs = utils.fasta_dict_from_file(args.input)

    ## standard characters (depends on the type of sequence)
    if args.type == 'n':
        standard_residues = set("ATGCU")
    else:
        standard_residues = set("ACDEFGHIKLMNPQRSTVWY")

    ## build the lookup of characters to ignore
    ignore_residues = {residue.upper() for residue in args.ignore}

    out_fh = sys.stdout if args.output is None else open(args.output, 'wt')
    list_fh = None

    try:
        if args.list is not None:
            list_fh = open(args.list, 'wt')

        ## process the sequences
        for seq_id in seqs:
            seq = seqs[seq_id]

            # non-standard residue -> number of occurrences in this sequence
            bad_chars = dict()

            # positions are reported 1-based
            for position, base in enumerate(seq['s'], start=1):
                ubase = base.upper()
                if ubase not in standard_residues and ubase not in ignore_residues:
                    bad_chars[ubase] = bad_chars.get(ubase, 0) + 1

                    if args.print_locations:
                        print("Molecule {0} contains residue {1} at position {2}".format(seq_id, ubase, position), file=sys.stderr)

            if list_fh is not None and bad_chars:
                list_fh.write("{0}".format(seq_id))

                for base in bad_chars:
                    list_fh.write( "\t{0}:{1}".format(base, bad_chars[base]) )

                list_fh.write("\n")

            if args.replace is not None:
                seq['s'] = seq['s'].replace(args.replace, args.with_)
                out_fh.write( ">{0} {1}\n".format(seq_id, seq['h']) )

                for i in range(0, len(seq['s']), 60):
                    out_fh.write(seq['s'][i : i + 60] + "\n")
    finally:
        # release the files we opened ourselves; STDOUT is left open
        if list_fh is not None:
            list_fh.close()
        if out_fh is not sys.stdout:
            out_fh.close()
def main():
    """Extract coordinate-defined regions from a multi-FASTA file.

    Each row of the tab-delimited coordinates file names a molecule plus
    1-based start/stop positions (column numbers are given on the command
    line, 1-based).  The region is cut from the molecule, reverse-complemented
    when ``utils.humancoords_to_0interbase`` reports strand -1, and written
    as 60-column FASTA to ``-o`` or STDOUT.  Rows with fewer than three
    columns are skipped.

    Raises:
        Exception: if a molecule ID is not present in the FASTA file.
    """
    parser = argparse.ArgumentParser(
        description='Extract regions from a multi-FASTA file')

    ## output file to be written
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-c',
                        '--coords_file',
                        type=str,
                        required=True,
                        help='Path to a tab-delimited file with coordinates')
    parser.add_argument('-m',
                        '--mol_col',
                        type=int,
                        required=True,
                        help='Tabdel file column with molecule identifiers')
    parser.add_argument(
        '-x',
        '--start_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate start positions')
    parser.add_argument(
        '-y',
        '--stop_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate stop positions')
    parser.add_argument(
        '-n',
        '--name_col',
        type=int,
        required=False,
        default=None,
        help='Optional tabdel file column with name for exported fragment')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.fasta_file)

    # command-line column numbers are 1-based; list indexes are 0-based
    start_col = args.start_coord_col - 1
    stop_col = args.stop_coord_col - 1
    mol_col = args.mol_col - 1

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                cols = line.rstrip().split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = utils.humancoords_to_0interbase(
                    int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception(
                        "ERROR: molecule ID ({0}) not found in FASTA file".format(
                            mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                # default name encodes molecule, interbase range and strand
                if args.name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format(mol_id, fmin, fmax,
                                                        strand)
                else:
                    seq_id = cols[int(args.name_col) - 1]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i:i + 60] + "\n")
    finally:
        # never close STDOUT, but do release a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Split a multi-FASTA file into separate nucleotide and protein files.

    For each sequence ``nucleotide_composition`` is compared with the
    ``--cutoff`` value; sequences at or above it are written to the
    nucleotide output (``-n``), all others to the protein output (``-p``).
    Either output may be omitted, in which case that class is dropped.

    Raises:
        Exception: if neither ``-p`` nor ``-n`` was given.
    """
    parser = argparse.ArgumentParser(
        description=
        'Split multi-FASTA file into separate protein and nucleotide files')

    ## output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-p',
                        '--protein',
                        type=str,
                        required=False,
                        help='Path to the protein FASTA output file to be created')
    parser.add_argument('-n',
                        '--nucleotide',
                        type=str,
                        required=False,
                        help='Path to the nucleotide FASTA output file to be created')
    # The cutoff is compared numerically below, so it must be parsed as a
    # number; with type=str any user-supplied value raised TypeError.
    parser.add_argument(
        '-c',
        '--cutoff',
        type=float,
        required=False,
        default=80,
        help=
        'Min percent (1-100) of ATGCNX content to be considered a nucleotide sequence'
    )
    args = parser.parse_args()

    ## the user should have specified at least one
    if args.protein is None and args.nucleotide is None:
        raise Exception(
            "ERROR: you must specify either -p or -n options (else why are you running this script?"
        )

    pout = nout = None

    try:
        if args.protein is not None:
            pout = open(args.protein, 'wt')

        if args.nucleotide is not None:
            nout = open(args.nucleotide, 'wt')

        seqs = utils.fasta_dict_from_file(args.input)

        for seq_id in seqs:
            seq = seqs[seq_id]
            seqcomp = nucleotide_composition(seq['s'])
            seq_wrapped = wrapped(seq['s'], every=60)

            # at/above the cutoff it's a nucleotide, otherwise a protein
            target_fh = nout if seqcomp >= args.cutoff else pout

            if target_fh is not None:
                target_fh.write(">{0} {1}\n{2}\n".format(
                    seq_id, seq['h'], seq_wrapped))
    finally:
        for fh in (pout, nout):
            if fh is not None:
                fh.close()
def load_from_file(self, file):
    """Add one Assembly to this collection for every entry of a FASTA file.

    Each assembly takes the FASTA entry's ID as its id and the entry's 's'
    value as its residues.
    """
    for seq_id, entry in utils.fasta_dict_from_file(file).items():
        self.add(Assembly(id=seq_id, residues=entry['s']))
def main():
    """Merge several masked copies of the same FASTA file into one.

    The last file on the command line is used as the seed.  For every other
    file, each position where that file has ``N`` is set to ``N`` in the
    seed; any other disagreement (where the seed is not already ``N``) is
    reported as a warning on STDERR.  The merged sequences are written as
    wrapped FASTA to ``-o`` or STDOUT.

    Raises:
        Exception: if a sequence ID is missing from the seed file or its
            sequence length differs from the seed's.
    """
    parser = argparse.ArgumentParser(description='Merge masked FASTA files')

    ## output file to be written
    parser.add_argument('fasta_files',
                        metavar='N',
                        type=str,
                        nargs='+',
                        help='Pass one or more FASTA files')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # pull off a file and index it; pop() removes it from args.fasta_files,
    # so the loop below only visits the remaining files
    seqs = utils.fasta_dict_from_file(args.fasta_files.pop())

    # python strings are immutable, so we need to transform these into lists
    for seq_id in seqs:
        seqs[seq_id]['s'] = list(seqs[seq_id]['s'])

    for fasta_file in args.fasta_files:
        new_seqs = utils.fasta_dict_from_file(fasta_file)

        for seq_id in new_seqs:
            # make sure it exists in the source file
            if seq_id not in seqs:
                raise Exception(
                    "ERROR: Seq ID {0} was found in file {1} but not in the seed file"
                    .format(seq_id, fasta_file))

            merged = seqs[seq_id]['s']
            incoming = new_seqs[seq_id]['s']

            # they should also be the same length -- compare the sequences
            # themselves, not the record dicts that hold them
            if len(merged) != len(incoming):
                raise Exception(
                    "ERROR: Seq ID {0} was found in {1} and the seed file but had different lengths"
                    .format(seq_id, fasta_file))

            for i, base in enumerate(incoming):
                if base == merged[i]:
                    continue

                if base == 'N':
                    merged[i] = 'N'
                elif merged[i] != 'N':
                    # STDERR, so warnings never mix with FASTA on STDOUT
                    print("WARNING: Disagreement {0}-{1} at position {2}".
                          format(base, merged[i], i),
                          file=sys.stderr)

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        # now done, print out the results
        for seq_id in seqs:
            ofh.write(">{0} {1}\n{2}\n".format(
                seq_id, seqs[seq_id]['h'],
                utils.wrapped_fasta(''.join(seqs[seq_id]['s']))))
    finally:
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Summarise reference and gene coverage from nucmer ``show-coords`` output.

    Alignment rows (exactly 11 whitespace-separated columns) are grouped by
    reference ID (column 10).  For each group the query fragments are passed
    to ``calculate_gene_coverage_fragments`` (when the reference has
    annotation) and ``calculate_fragment_coverage``.  Afterwards gene-level
    results and overall reference coverage/identity percentages are written
    to a set of files sharing ``--output_prefix``.

    Raises:
        Exception: if the coords file has no 11-column alignment lines.
    """
    # function-scope import: only needed to manage the many output handles
    from contextlib import ExitStack

    parser = argparse.ArgumentParser( description='Parses nucmer coords output to provide an overall coverage report')

    ## coords file generated with: show-coords -l -r -T out.delta
    parser.add_argument('-c', '--coords_file', type=str, required=True,
                        help='Path to a nucmer coords file with non-overlapping results (requires -l -r -T options of show-coords)' )
    parser.add_argument('-o', '--output_prefix', type=str, required=True, help='Several output files will be created with this prefix.' )
    parser.add_argument('-a', '--annotation_file', type=str, required=True, help='Path to a sorted GFF3 annotation file' )
    parser.add_argument('-r', '--reference_fasta', type=str, required=True, help='Path to the reference file used with nucmer' )
    parser.add_argument('-k', '--annotation_key', type=str, required=False, help='Optional.  Key string to look for in the 9th column of the GFF3 file for an annotation string.' )
    args = parser.parse_args()

    ## like: h[$assem] = [ {id=?,fmin=?,fmax=?}, ... ]
    annot = parse_annotation( args.annotation_file, args.annotation_key )

    ## like: [ {id=?,qfmin=?,qfmax=?,rfmin=?,rfmax=?} ]
    query_fragments = []

    ref_molecules = utils.fasta_dict_from_file(args.reference_fasta)
    ref_n_total = 0

    for ref_id in ref_molecules:
        ref_n_total += len( ref_molecules[ref_id]['s'] )

    # ExitStack closes every handle registered on it, even on error
    with ExitStack() as stack:
        ## open the output files
        genecov_stats_ofh     = stack.enter_context(open(args.output_prefix + ".stats.gene_coverage", "wt"))
        genesmissing_list_ofh = stack.enter_context(open(args.output_prefix + ".list.genes_missing", "wt"))
        refmol_stats_ofh      = stack.enter_context(open(args.output_prefix + ".stats.refmol_coverage", "wt"))
        refcov_stats_ofh      = stack.enter_context(open(args.output_prefix + ".tab.refmol_coverage", "wt"))
        refext_list_ofh       = stack.enter_context(open(args.output_prefix + ".tab.extensions", "wt"))
        genecov_tab_ofh       = stack.enter_context(open(args.output_prefix + ".tab.gene_coverage", "wt"))
        coords_fh             = stack.enter_context(open(args.coords_file, 'r'))

        refext_list_ofh.write("# {0}\n".format(args.output_prefix) )
        refext_list_ofh.write("# reference_id\tref_fmin\tref_fmax\tref_strand\tqry_id\tqry_fmin\tqry_fmax\tqry_strand\tqry_length\n")

        ref_cov_stats = { 'n_cov': 0, 'n_uncov': 0, 'n_identical': 0 }

        alignment_lines_found = 0
        current_ref_id = None

        for line in coords_fh:
            cols = line.split()

            # only 11-column rows are alignment rows; skip everything else
            if len(cols) != 11:
                continue
            alignment_lines_found += 1

            # columns 1-4: reference start/end, query start/end
            cols[0] = int(cols[0])
            cols[1] = int(cols[1])
            cols[2] = int(cols[2])
            cols[3] = int(cols[3])

            if cols[9] != current_ref_id:
                # reference changed: flush what was collected for the last one
                if current_ref_id is not None:
                    if current_ref_id in annot:
                        calculate_gene_coverage_fragments( annot[current_ref_id], query_fragments )

                    calculate_fragment_coverage( current_ref_id, query_fragments, current_ref_length, ref_cov_stats, refcov_stats_ofh, refext_list_ofh )

                ## reset
                current_ref_id = cols[9]
                current_ref_length = int(cols[7])
                query_fragments = []

                ## quick sanity check
                if current_ref_id not in annot:
                    print("WARNING: found a nucleotide accession for which we have no annotation: {0}".format(current_ref_id))

            # query coordinates given high-to-low mean a reverse-strand hit
            qstrand = 1
            if cols[2] > cols[3]:
                qstrand = -1

            # coordinates are stored 0-based, half-open (min - 1, max)
            fragment = {}
            fragment['id'] = cols[10]
            fragment['qfmin'] = min(cols[2], cols[3]) - 1
            fragment['qfmax'] = max(cols[2], cols[3])
            fragment['qlen'] = int(cols[8])
            fragment['qstrand'] = qstrand
            fragment['rfmin'] = min(cols[0], cols[1]) - 1
            fragment['rfmax'] = max(cols[0], cols[1])
            fragment['rlen'] = int(cols[7])
            fragment['pctid'] = float(cols[6])
            query_fragments.append(fragment)

        ## don't forget the last one
        if current_ref_id is not None:
            if current_ref_id in annot:
                calculate_gene_coverage_fragments( annot[current_ref_id], query_fragments )

            calculate_fragment_coverage( current_ref_id, query_fragments, current_ref_length, ref_cov_stats, refcov_stats_ofh, refext_list_ofh )

        if alignment_lines_found == 0:
            raise Exception("ERROR: failed to find any 11-column alignment lines")
        else:
            print("INFO: {0} alignment lines found".format(alignment_lines_found) )

        report_gene_coverage_results( annot, genecov_stats_ofh, genesmissing_list_ofh, genecov_tab_ofh )

        cov_perc = (ref_cov_stats['n_cov'] / ref_n_total) * 100
        cov_perc_id = (ref_cov_stats['n_identical'] / ref_n_total) * 100
        refmol_stats_ofh.write("Total bases in reference molecules\t{0}\n".format(ref_n_total) )
        refmol_stats_ofh.write("Ref bases covered by query fragments\t{0}\n".format(ref_cov_stats['n_cov']) )
        refmol_stats_ofh.write("Ref % covered by query fragments\t{0:.2f}\n".format(cov_perc))
        refmol_stats_ofh.write("Ref % identity by query fragments\t{0:.2f}\n".format(cov_perc_id))
def main():
    """Split FASTA molecules at reported gaps and write the resulting fragments.

    Reads a tab-delimited gaps file whose first three columns are
    ``molecule_id``, ``start`` and ``stop`` (treated here as 1-based,
    inclusive coordinates), and for each molecule hands the stretches
    between consecutive gaps to ``export_fragment``.  Rows for the same
    molecule are expected to be adjacent and ordered by coordinate, since
    only the previous row is remembered.

    Gaps shorter than ``--min_gap_length`` (when given) are ignored.
    ``--min_fragment_length`` is passed through unchanged to
    ``export_fragment``.  Any molecule in the FASTA file that is not present
    in ``molecules_split`` after all gaps are processed is written out whole.
    """
    parser = argparse.ArgumentParser(description="Splits FASTA file based on reported coverage gaps")

    ## output file to be written
    parser.add_argument("-g", "--gaps_file", type=str, required=True, help="Path to an input gaps file to be read")
    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file to be read")
    parser.add_argument("-o", "--output_file", type=str, required=True, help="Path to an output file to be created")
    parser.add_argument(
        "-mfl",
        "--min_fragment_length",
        type=int,
        required=False,
        help="Min length required for a fragment to be exported",
    )
    parser.add_argument(
        "-mgl", "--min_gap_length", type=int, required=False, help="Ignore gaps reported under this min length"
    )
    args = parser.parse_args()

    fasta = utils.fasta_dict_from_file(args.fasta_file)

    # this is just to keep track of which we've exported; it is handed to
    # export_fragment on every call and consulted once all gaps are processed
    molecules_split = dict()

    last_molecule_id = None
    last_end_coordinate = None

    # Context managers guarantee both handles are closed (and the output
    # flushed) even if a malformed row raises part-way through.
    with open(args.output_file, "wt") as ofh:

        def export_tail(mol_id, gap_end):
            """Export everything after the last gap seen on mol_id."""
            export_fragment(
                ofh,
                fasta,
                mol_id,
                gap_end + 1,
                len(fasta[mol_id]["s"]),
                args.min_fragment_length,
                molecules_split,
            )

        with open(args.gaps_file) as gaps_fh:
            for line in gaps_fh:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                if not line.strip():
                    continue

                cols = line.split("\t")
                mol_id, start, stop = cols[0], int(cols[1]), int(cols[2])

                # skip if this is too short
                if args.min_gap_length is not None and (stop - start + 1) < args.min_gap_length:
                    continue

                if mol_id != last_molecule_id:
                    # new molecule (or the very first row): finish the
                    # previous molecule before starting on this one
                    if last_molecule_id is not None:
                        export_tail(last_molecule_id, last_end_coordinate)

                    # export the beginning of this one unless the gap starts
                    # at position 1, in which case there is nothing before it
                    if start != 1:
                        export_fragment(
                            ofh, fasta, mol_id, 1, start - 1, args.min_fragment_length, molecules_split
                        )
                else:
                    # same molecule as we just saw, but new entry for it:
                    # export end of last gap until beginning of this one
                    export_fragment(
                        ofh,
                        fasta,
                        mol_id,
                        last_end_coordinate + 1,
                        start - 1,
                        args.min_fragment_length,
                        molecules_split,
                    )

                last_molecule_id = mol_id
                last_end_coordinate = stop

        # The loop only flushes a molecule's tail when a different molecule
        # follows it, so the final molecule's tail must be exported here or
        # the sequence after its last gap would be silently lost.
        if last_molecule_id is not None:
            export_tail(last_molecule_id, last_end_coordinate)

        # Now export the full sequences of any which weren't split
        for seq_id in fasta:
            if seq_id not in molecules_split:
                ofh.write(">{0}\n{1}\n".format(seq_id, utils.wrapped_fasta(fasta[seq_id]["s"])))