def get_fastx_entries(fastx_filename, fasta=False, fastq=False):
    """
    Get entries of FASTQ/FASTA file.

    If fasta=True, read file as FASTA regardless of extension.
    If fastq=True, read file as FASTQ regardless of extension.
    If both flags are set, fasta takes precedence.

    When neither flag is given, the file type reported by
    get_fastx_type() decides which reader is used.

    Returns whatever fasta_utils.read_fasta / fastq_utils.read_fastq
    returns for the file, or an empty list when the type is not
    recognized as "fasta" or "fastq".
    """
    # Explicit overrides are honored before looking at the detected
    # type. Previously a FASTA-looking filename won even when
    # fastq=True was passed, contradicting the documented contract.
    if fasta:
        return fasta_utils.read_fasta(fastx_filename)
    if fastq:
        return fastq_utils.read_fastq(fastx_filename)
    # No override: fall back to the detected file type
    fastx_type = get_fastx_type(fastx_filename)
    if fastx_type == "fasta":
        # It's a FASTA file
        return fasta_utils.read_fasta(fastx_filename)
    if fastx_type == "fastq":
        # It's a FASTQ file
        return fastq_utils.read_fastq(fastx_filename)
    # Unrecognized type: no entries
    return []
def download_misc_seqs(genome, output_dir):
    """
    Download assorted sequences related to genome.

    genome: genome build name. Names starting with "hg" are treated
      as human and names starting with "mm" as mouse. Any other
      name prints an error and exits with status 1.
    output_dir: base directory. The "ncbi" subdirectory is handed to
      download_ncbi_fasta() and the relabeled sequences are written
      to the "misc" subdirectory as <label>.fa.

    Sequences whose output file already exists, or whose accession
    is None, are skipped. Only the first FASTA record of each
    download is kept, re-labeled with the sequence label.

    If a download yields no records, the StopIteration raised by
    next() propagates to the caller (as before), but no empty output
    file is left behind.
    """
    # Mapping from genome build prefix to organism
    if genome.startswith("hg"):
        organism = "human"
    elif genome.startswith("mm"):
        organism = "mouse"
    else:
        print("Error: Unsupported genome.")
        sys.exit(1)
    # Fetch the accession numbers for the organism's
    # misc sequences and download them
    misc_seqs = NCBI_MISC_SEQS[organism]
    ncbi_outdir = os.path.join(output_dir, "ncbi")
    misc_outdir = os.path.join(output_dir, "misc")
    utils.make_dir(ncbi_outdir)
    utils.make_dir(misc_outdir)
    for seq_label, access_id in misc_seqs.items():
        if access_id is None:
            continue
        output_filename = os.path.join(misc_outdir, "%s.fa" % (seq_label))
        if os.path.isfile(output_filename):
            print("%s exists. Skipping download.." % (seq_label))
            continue
        print("Downloading: %s (NCBI: %s)" % (seq_label, access_id))
        url_filename = download_ncbi_fasta(access_id, ncbi_outdir)
        fasta_in = fasta_utils.read_fasta(url_filename)
        # Fetch the first FASTA record *before* creating the output
        # file: if this fails, no empty <label>.fa is left behind to
        # be mistaken for a finished download on the next run.
        _, fasta_seq = next(fasta_in)
        # Output it with the required label
        new_rec = (">%s" % (seq_label), fasta_seq)
        print(" - Writing to: %s" % (output_filename))
        # The context manager guarantees the file is flushed and closed
        with open(output_filename, "w") as fasta_out:
            fasta_utils.write_fasta(fasta_out, [new_rec])
def __init__(self, fasta_fname):
    """
    Remember the FASTA filename and load its sequences.

    fasta_fname: path handed to fasta_utils.read_fasta(); the
      result is stored on the instance as `seqs`.
    """
    self.fasta_fname = fasta_fname
    # Read back through the attribute, exactly as stored on the instance
    source_path = self.fasta_fname
    self.seqs = fasta_utils.read_fasta(source_path)