def output_dinuc_shuffled_fasta(fasta_fname, shuffled_fasta_fname, num_shuffles=1):
    """
    Given a FASTA file, output a dinucleotide shuffled version of it.

    For every entry read from ``fasta_fname``, ``num_shuffles`` shuffled
    sequences are written to ``shuffled_fasta_fname``. Each shuffled
    record is written under the name of the entry it was derived from,
    so with ``num_shuffles > 1`` the output contains repeated names.

    Args:
        fasta_fname: path of the input file, read with
            ``fastx_utils.get_fastx_entries``.
        shuffled_fasta_fname: path of the output file, opened with
            ``fastx_utils.write_open_fastx``.
        num_shuffles: number of shuffled records to emit per input entry.
    """
    fasta_out = fastx_utils.write_open_fastx(shuffled_fasta_fname)
    try:
        for fastx_name, fastx_seq in fastx_utils.get_fastx_entries(fasta_fname):
            shuffled_recs = []
            for _ in range(num_shuffles):
                # Only the first element returned by get_dinuc_shuffles is
                # used; it is called once per requested shuffle.
                shuffled_seq = get_dinuc_shuffles(fastx_seq)[0]
                shuffled_recs.append((fastx_name, shuffled_seq))
            fasta_utils.write_fasta(fasta_out, shuffled_recs)
    finally:
        # Close the output even if reading or shuffling raises, so the
        # handle is not leaked.
        fasta_out.close()
def download_misc_seqs(genome, output_dir):
    """
    Download assorted sequences related to genome.

    The organism is chosen from the genome name prefix ("hg" -> human,
    "mm" -> mouse). For each (label, accession) pair listed in
    ``NCBI_MISC_SEQS`` for that organism, the FASTA is downloaded into
    ``<output_dir>/ncbi`` and its first record is rewritten, with the
    header replaced by ``>label``, to ``<output_dir>/misc/<label>.fa``.

    Labels whose accession is None are skipped, as are labels whose
    output file already exists.

    Args:
        genome: genome name, e.g. a string starting with "hg" or "mm".
        output_dir: directory under which the "ncbi" and "misc"
            subdirectories are created.

    Note:
        For an unsupported genome an error is printed and the process
        exits via ``sys.exit(1)``.
    """
    # Mapping from sequence label (e.g. rRNA)
    # to accession numbers
    organism = None
    if genome.startswith("hg"):
        organism = "human"
    elif genome.startswith("mm"):
        organism = "mouse"
    else:
        print("Error: Unsupported genome.")
        sys.exit(1)
    # Fetch the accession numbers for the organism's
    # misc sequences and download them
    misc_seqs = NCBI_MISC_SEQS[organism]
    ncbi_outdir = os.path.join(output_dir, "ncbi")
    misc_outdir = os.path.join(output_dir, "misc")
    utils.make_dir(ncbi_outdir)
    utils.make_dir(misc_outdir)
    for seq_label, access_id in misc_seqs.items():
        if access_id is None:
            continue
        output_filename = os.path.join(misc_outdir, "%s.fa" %(seq_label))
        if os.path.isfile(output_filename):
            print("%s exists. Skipping download.." %(seq_label))
            continue
        print("Downloading: %s (NCBI: %s)" %(seq_label, access_id))
        url_filename = download_ncbi_fasta(access_id, ncbi_outdir)
        fasta_in = fasta_utils.read_fasta(url_filename)
        print("  - Writing to: %s" %(output_filename))
        # Fetch first FASTA record. This is done before the output file
        # is created so that a failed read does not leave behind an empty
        # file that the "exists" check above would skip on a later run.
        curr_label, fasta_seq = next(fasta_in)
        # Output it with the required label
        new_rec = (">%s" %(seq_label), fasta_seq)
        # The with-block closes (and flushes) the output for every
        # sequence instead of leaking one open handle per iteration.
        with open(output_filename, "w") as fasta_out:
            fasta_utils.write_fasta(fasta_out, [new_rec])