def stream_fa(sequence_file: str) -> Generator[FastA, None, None]:
    """
    Read a FASTA/FASTQ file, gzipped or not, and return it as a stream of
    FastA(header, sequence) tuples. Quality values from FASTQ input are discarded.

    :param sequence_file: Path to the input file.
    :return: Generator[FastA, None, None]
    """
    if sequence_file.endswith('fq.gz') or sequence_file.endswith('fastq.gz'):
        with gzip.open(sequence_file, 'rt') as handle:
            for header, sequence, qual in Bio.SeqIO.QualityIO.FastqGeneralIterator(handle):
                yield FastA(header, sequence)
    elif sequence_file.endswith('fq') or sequence_file.endswith('fastq'):
        with open(sequence_file) as handle:
            for header, sequence, qual in Bio.SeqIO.QualityIO.FastqGeneralIterator(handle):
                yield FastA(header, sequence)
    elif sequence_file.endswith('fasta.gz') or sequence_file.endswith('fa.gz'):
        with gzip.open(sequence_file, 'rt') as handle:
            for header, sequence in FastaIO.SimpleFastaParser(handle):
                yield FastA(header, sequence)
    elif sequence_file.endswith('fasta') or sequence_file.endswith('fa'):
        with open(sequence_file) as handle:
            for header, sequence in FastaIO.SimpleFastaParser(handle):
                yield FastA(header, sequence)
    else:
        raise Exception(f'{sequence_file} is not a recognized sequence file.')
def stream_fa(infile):
    if infile.endswith('fasta.gz') or infile.endswith('fa.gz'):
        with gzip.open(infile, 'rt') as handle:
            for header, sequence in FastaIO.SimpleFastaParser(handle):
                yield (header, sequence)
    elif infile.endswith('fasta') or infile.endswith('fa'):
        with open(infile, 'rt') as handle:
            for header, sequence in FastaIO.SimpleFastaParser(handle):
                yield (header, sequence)
    else:
        raise Exception(f'{infile} not a sequence file.')
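# Usage sketch for the stream_fa readers above. The FastA namedtuple and the
# input path are assumptions for illustration; they are not part of the source.
import gzip
from collections import namedtuple

import Bio.SeqIO.QualityIO
from Bio.SeqIO import FastaIO

# Assumed record type, consistent with the two-field yields above.
FastA = namedtuple("FastA", ["header", "sequence"])

# Count records in a hypothetical "reads.fastq.gz" without loading it into memory.
n_records = sum(1 for _ in stream_fa("reads.fastq.gz"))
print(f"{n_records} records streamed")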
def main(input, blastndb, output, probe_length, search_step, evalue,
         blastn_tmpdir, threads):
    """
    Search for uniquely mapped probes (sub-sequences) within a series of
    sequences stored in a FASTA file.

    \b
    For example, first select 30 candidate probe regions of length 500 bp:
    ```
    $ python uniformly_spaced.py data/hg19.fa ./candidate.fa chr1:89000000-90000000 -n 30 -l 500
    ```
    then select uniquely mapped probes (sub-sequences) from them:
    ```
    $ python search_uniq.py candidate.fa example/blastn_db/hg19 probe.fa
    ```

    \b
    Args
    ----
    input : str
        Path to the input FASTA file.
    blastndb : str
        Path to the blastn database, built with the `makeblastdb` command.
    output : str
        Path to the output FASTA file.
    """
    with open(input) as f:
        input_seqs = FastaIO.FastaIterator(f)
        probes = search_passed_probes(input_seqs, blastndb, evalue,
                                      probe_length, search_step,
                                      blastn_tmpdir, threads)
        save_fasta(probes, output)
def first_occurrences_only(fasta_in):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(
        out_basedir, in_fasta_basename + "_first_occurrences_only.fasta")
    total_seq_count = 0
    hashes_seen_before = set()  # SHA-256 digests of sequences already written
    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)
    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            total_seq_count += 1
            record_hash = hashlib.sha256(str(record.seq).encode("UTF-8")).hexdigest()
            if record_hash not in hashes_seen_before:
                hashes_seen_before.add(record_hash)
                fasta_out.write_record(record)
            # else: the sequence is identical to one seen earlier; skip it
    print("{} seqs seen".format(total_seq_count))
    print("{} unique seqs found".format(len(hashes_seen_before)))
    print("{} identical duplicates removed".format(
        total_seq_count - len(hashes_seen_before)))
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list, sample_metadata_list):
    otu_df_list = []
    rep_seq_ids = set()
    seqs = []
    # Create OTU table
    for unhashed_otu_table in unhashed_otu_table_list:
        otu_df_list.append(hash_otu_table(unhashed_otu_table))
    otu_df = pd.concat(otu_df_list, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index), list(otu_df.columns))
    # Create rep seqs
    for unhashed_rep_seqs in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed_rep_seqs, rep_seq_ids))
    otu_table_ids = set(otu_df.index)
    assert otu_table_ids == rep_seq_ids
    assert len(otu_df.index) == len(rep_seq_ids)
    # Merge sample metadata
    sample_metadata = pd.concat(
        [pd.read_csv(s, sep="\t") for s in sample_metadata_list])
    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid, "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
def load_data(k, stride, pos_fasta, neg_fasta):
    vocab = Vocabulary(k=k)
    X = []
    n_pos = 0
    n_neg = 0
    for fasta in pos_fasta, neg_fasta:
        with open(fasta) as f:
            for s in tqdm(FastaIO.FastaIterator(f)):
                seq = str(s.seq)
                if vocab.unknow_char in seq:
                    continue
                try:
                    x = vocab.kmer_count(seq, stride)
                except AssertionError:
                    continue
                X.append(x)
                if fasta == pos_fasta:
                    n_pos += 1
                else:
                    n_neg += 1
    X = np.vstack(X)
    y = np.hstack([np.ones(n_pos), np.zeros(n_neg)])
    return X, y
def remap_tax_id(fasta_in, remap_table):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(
        out_basedir, in_fasta_basename + "_annotated_for_beast.fasta")
    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)
    id_map = dict()
    with open(remap_table, "r") as map_handle:
        for line in map_handle:
            seqid, *other_fields = line.split("\t")
            if seqid in id_map:
                raise LookupError("%s already found in map" % seqid)
            if seqid == "taxa":
                # if this is a figtree-formatted annotation file, skip the
                # header row that has nothing but column labels
                continue
            id_map[seqid] = other_fields
    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            if record.id in id_map:
                if sum(len(x) for x in id_map[record.id]) > 0:
                    # replace empty fields with "?"; the replace is applied
                    # twice to catch runs of consecutive empty fields
                    new_description = "|".join(id_map[record.id]).replace(
                        "||", "|?|").replace("||", "|?|")
                    record.description = new_description
                fasta_out.write_record(record)
            else:
                print("Warning: '{}' not found in {}".format(
                    record.id, os.path.basename(remap_table)))
def unwrap_fasta(infile, outfile, strip_comment=False):
    """
    Read FASTA sequences from *infile* and write them unwrapped
    (one line per sequence) to *outfile*.

    :param str infile: The path to the input FASTA file.
    :param str outfile: The path to the output file.
    :param bool strip_comment: If True, keep only the record name in each title.
    """
    with open(outfile, "w") as fasta_out:
        if strip_comment:
            FastaIO.FastaWriter(
                fasta_out, wrap=None,
                record2title=Fastq2Fasta.just_name).write_file(
                    SeqIO.parse(infile, 'fasta'))
        else:
            FastaIO.FastaWriter(fasta_out, wrap=None).write_file(
                SeqIO.parse(infile, 'fasta'))
def write_fasta_1line(input_fasta_file, output_fasta_file):
    with open(input_fasta_file, "r") as handle:
        record_list = list(SeqIO.parse(handle, "fasta"))
        print(input_fasta_file)
        print("Number of protein sequences: ", len(record_list))
    with open(output_fasta_file, "w") as handle:
        fasta_writer = FastaIO.FastaWriter(handle, wrap=None)
        fasta_writer.write_file(record_list)
def input_text_to_df(input_text):
    """Converts fasta contents to a df with columns sequence_name and sequence."""
    with io.StringIO(initial_value=input_text) as f:
        fasta_records = list(FastaIO.FastaIterator(f))
        fasta_df = pd.DataFrame([(r.name, str(r.seq)) for r in fasta_records],
                                columns=['sequence_name', 'sequence'])
    return fasta_df
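# Usage sketch for input_text_to_df, with an illustrative in-memory FASTA
# string (assumes io, pandas as pd, and Bio.SeqIO.FastaIO are imported as the
# function above requires).
fasta_text = ">seq1 first example\nACGT\n>seq2\nGGCC\n"
df = input_text_to_df(fasta_text)
print(df)
# Expected output, approximately:
#   sequence_name sequence
# 0          seq1     ACGT
# 1          seq2     GGCC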
def Main():
    parser = argparse.ArgumentParser(description='Generate synthetic reads.',
                                     fromfile_prefix_chars='@')
    parser.add_argument("-t", "--num_transpositions", type=int, default=50,
                        help="Number of transpositions to generate.")
    parser.add_argument("-r", "--num_reads", type=int, default=10000,
                        help="Number of reads to generate per transposition.")
    parser.add_argument("-l", "--read_length", type=int, default=100,
                        help="Length of each generated read.")
    parser.add_argument("-o", "--output_fname",
                        default='generated_transposition_reads.fa',
                        help="Where to write FASTA output to.")
    TranspositionParams.AddArgs(parser)
    args = parser.parse_args()

    tn_params = TranspositionParams.FromArgs(args)
    insert_gen = InsertGenerator.FromTranspositionParams(tn_params)
    n_trans = args.num_transpositions
    n_reads = args.num_reads
    read_len = args.read_length
    print('Generating %d random transpositions with %d random %d NT reads each'
          % (n_trans, n_reads, read_len))
    print('Writing generated reads to FASTA')
    x = 0
    with open(args.output_fname, 'w') as fh:
        writer = FastaIO.FastaWriter(fh)
        writer.write_header()
        for construct_num in range(n_trans):
            trans = Transposition(construct_num, insert_gen,
                                  tn_params.backbone_seq,
                                  tn_params.backbone_start_offset)
            for read_num in range(n_reads):
                frag = trans.Shear(read_num, read_len)
                record = frag.ToSeqRecord()
                writer.write_record(record)
                x += 1
                if x % 1000000 == 0:
                    print("Created %d reads" % x)
        writer.write_footer()

    print('Zipping generated reads.')
    gzip_fname = '%s.gz' % args.output_fname
    # Copy the file contents (not the filename string) into the gzip archive.
    with open(args.output_fname, 'rb') as src, GzipFile(gzip_fname, mode='w') as gzipf:
        gzipf.write(src.read())
def export_dna_record(gene_seq, gene_id, gene_description, output_handle):
    seq_object = Seq(gene_seq, IUPAC.unambiguous_dna)
    seq_record = SeqRecord(seq_object)
    seq_record.id = gene_id
    seq_record.description = gene_description
    fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
    fasta_out.write_header()
    fasta_out.write_record(seq_record)
    fasta_out.write_footer()
def openFasta(path):
    """Open a FASTA file as a simple dict (refname is trimmed after the first space)."""
    from Bio.SeqIO import FastaIO
    with open(path) as handle:
        # trim after the first space (as in ref in bam file)
        return {title.split()[0]: seq
                for title, seq in FastaIO.SimpleFastaParser(handle)}
def get_proteins(path):
    genes = set()
    with open(path) as handle:
        for title, seq in fio.SimpleFastaParser(handle):
            genes.add(title)
    genes = res.get_unified_names(genes)
    return genes
def recomp(input, output, both=False):
    fasta_out = FastaIO.FastaWriter(output, wrap=None)
    fasta_out.write_header()
    for seq_record in SeqIO.parse(input, "fasta"):
        rc_rec = seq_record.reverse_complement(id=seq_record.id + "_RC",
                                               description="")
        # always write the reverse complement; also keep the original if requested
        if both:
            fasta_out.write_record(seq_record)
        fasta_out.write_record(rc_rec)
    fasta_out.write_footer()
def write(data, filename):
    records = []
    with open(filename, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        for element in data:
            sequence = SeqRecord(Seq(element['seq']),
                                 id=element['id'],
                                 description=element['description'])
            records.append(sequence)
        fasta_out.write_file(records)
def get_fasta(pdb_file, fasta_file, transfer_ids=None):
    fasta_writer = FastaIO.FastaWriter(fasta_file)
    fasta_writer.write_header()
    for rec in PdbIO.PdbSeqresIterator(pdb_file):
        if len(rec.seq) == 0:
            continue
        if transfer_ids is not None and rec.id not in transfer_ids:
            continue
        print(rec.id, rec.seq, len(rec.seq))
        fasta_writer.write_record(rec)
def perform_mapping(mapping_dir, og_files, threshold=1, exclude_species=["none"]):
    og_dict = {}
    # read in OGs with amino-acid sequences
    og = list(SeqIO.parse(og_files, "fasta"))
    for record in og:
        key = record.description.split(" | ")[-1]
        if key in og_dict:
            ids = [rec.id for rec in og_dict[key]]
            if record.id not in ids:
                og_dict[key].append(record)
        else:
            og_dict[key] = []
            og_dict[key].append(record)
    # parse the reads mapped to OGs into a dictionary
    all_dict = {}
    for file in glob.glob(mapping_dir + "*.fa"):
        og_name = file.split("_")[-1].split(".")[0]
        og = og_dict[og_name]
        # change ids to species names
        for i, record in enumerate(og):
            s = record.id[0:5]
            record.id = s
        # find the best representative sequence based on mapping
        mapping = list(SeqIO.parse(file, "fasta"))
        best_translated_seq = find_best_translation_by_similarity(
            mapping, og[0], exclude_species=exclude_species)
        if best_translated_seq is not None:
            og.append(best_translated_seq)
        if get_coverage(og) <= threshold:
            all_dict[og_name] = og
    if threshold != 1:
        OG_OUT = mapping_dir + 'og' + str(threshold) + "/"
    elif exclude_species[0] != "none":
        OG_OUT = mapping_dir + 'og_without' + "_".join(exclude_species) + "/"
    else:
        OG_OUT = mapping_dir + 'og/'
    if not os.path.exists(OG_OUT):
        os.makedirs(OG_OUT)
    for key, item in all_dict.items():
        file_name = OG_OUT + key + ".fa"
        with open(file_name, "w") as handle:
            fasta_out = FastaIO.FastaWriter(handle, wrap=None)
            fasta_out.write_file(item)
    print("FINISHED OG RECONSTRUCTION!")
    return all_dict
def hash_rep_seqs(unhashed_rep_seqs, output_file):
    seqs = list(SeqIO.parse(unhashed_rep_seqs, "fasta"))
    seq_ids = []
    for seq in seqs:
        seq.id = hash_function(str(seq.seq))
        seq_ids.append(seq.id)
        seq.description = ""
        seq.name = ""
    with open(output_file, "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
    return seq_ids
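# hash_function is not defined in these excerpts; below is one plausible
# stand-in, mirroring the SHA-256 digests used in first_occurrences_only
# above, followed by a hypothetical call (both file paths are illustrative).
import hashlib

def hash_function(sequence):
    # SHA-256 hex digest of the sequence string (an assumed definition)
    return hashlib.sha256(sequence.encode("utf-8")).hexdigest()

hashed_ids = hash_rep_seqs("rep_seqs_raw.fasta", "rep_seqs_hashed.fasta")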
def get_seq_from_ids(inputfile=None, outputfile=None, id_file=None):
    id_list = id_file.readlines()
    id_list = [x.replace(">", "").rstrip("\n") for x in id_list]
    record_dict = SeqIO.to_dict(SeqIO.parse(inputfile.name, "fasta"))
    fasta_out = FastaIO.FastaWriter(outputfile, wrap=None)
    fasta_out.write_header()
    for i in id_list:
        fasta_out.write_record(record_dict[i])
def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            record.id = sanitize_id_for_sam_rname(record.id)
            fasta_out.write_record(record)
    print("out_filepath", out_filepath)
    print("os.path.dirname(out_filepath)", os.path.dirname(out_filepath))
    print("ls -lah")
    for line in subprocess.check_output(
            ["ls", "-lah", os.path.dirname(out_filepath)]).decode("utf-8").split("\n"):
        print(line)
    return out_filepath
def MakeConcensusAlignment(pair):
    write_tmpfasta = "tmp.fasta"
    handle = open(write_tmpfasta, "w")
    writer = FastaIO.FastaWriter(handle, wrap=None)
    writer.write_file(pair)
    handle.close()  # was `handle1.close`, which referenced the wrong name and never called close()
    while not os.path.exists(write_tmpfasta):
        time.sleep(1)
    command = "mafft " + write_tmpfasta + " > tmp_2.fasta"
    print(command)
    subprocess.call(command, shell=True)
    return ()
def all_sequence_names_from_fasta_file(input_fasta_file_name):
    """Returns all sequence names from a fasta file.

    Args:
      input_fasta_file_name: string.

    Returns:
      list of string.
    """
    with tf.io.gfile.GFileText(input_fasta_file_name) as input_file:
        return [
            get_sequence_name_from(protein_name_incl_family)
            for protein_name_incl_family, _ in FastaIO.SimpleFastaParser(input_file)
        ]
def single_line_records(fasta_in):
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(out_basedir,
                                in_fasta_basename + "_single_lines.fasta")
    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)
    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            fasta_out.write_record(record)
def _assert_fasta_parsable(input_text):
    with io.StringIO(initial_value=input_text) as f:
        fasta_itr = FastaIO.FastaIterator(f)
        end_iteration_sentinel = object()

        # Avoid parsing the entire FASTA contents by using `next`.
        # A malformed FASTA file will have no entries in its FastaIterator.
        # This is unfortunate (instead of it throwing an error).
        if next(fasta_itr, end_iteration_sentinel) is end_iteration_sentinel:
            raise ValueError(
                'Failed to parse any input from fasta file. '
                'Consider checking the formatting of your fasta file. '
                'First bit of contents from the fasta file was\n'
                '{}'.format(input_text.splitlines()[:3]))
def write_records(records, output_file):
    """
    Writes FASTA records (BioPython SeqRecord) to a file.

    Parameters
    ----------
    records : list
        List with BioPython SeqRecord objects.
    output_file : str
        Path to the output file.
    """
    with open(output_file, 'w') as output_handle:
        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
        fasta_out.write_file(records)
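# Usage sketch for write_records: build two toy SeqRecord objects and write
# them unwrapped. The record contents and output path are illustrative only.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

records = [
    SeqRecord(Seq("ACGTACGT"), id="toy1", description="first toy sequence"),
    SeqRecord(Seq("GGCCTTAA"), id="toy2", description="second toy sequence"),
]
write_records(records, "toy.fasta")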
def convert_multiline_to_single_line_FASTA():
    global read_len_dict
    sequences = []
    with open("preads4falcon.fasta", "r") as input_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            sequences.append(record)
            read_len_dict[record.id] = len(record.seq)
            # the "'" suffix denotes the reverse-complement read
            record_complement = record.id + "'"
            read_len_dict[record_complement] = len(record.seq)
    with open("formatted_preads4falcon.fasta", "w") as output_handle:
        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
        fasta_out.write_file(sequences)
def main(args):
    COUNT_SEPARATOR = '_x'
    seqs = {}
    for seq_record in SeqIO.parse(args.input_fasta, "fasta"):
        split_seqid = seq_record.id.split(COUNT_SEPARATOR)
        if len(split_seqid) == 1:
            seqs[split_seqid[0]] = {}
            seqs[split_seqid[0]]['seq'] = seq_record.seq
            seqs[split_seqid[0]]['count'] = 1
        elif len(split_seqid) == 2:
            seqs[split_seqid[0]] = {}
            seqs[split_seqid[0]]['seq'] = seq_record.seq
            seqs[split_seqid[0]]['count'] = int(split_seqid[1])
        else:
            logging.error("Error parsing: %s", seq_record.id)

    # combinations('ABCD', 2) gives:
    # AB AC AD BC BD CD
    # ie. we don't need to compare AB and BA
    for seq1, seq2 in combinations(seqs, 2):
        # Need to skip over sequences that have been removed on previous iterations
        if seq1 not in seqs or seq2 not in seqs:
            continue
        # Translate each pair of seqs; skip the pair if either fails to
        # translate (previously the loop continued with unbound variables)
        try:
            seq1_translated = seqs[seq1]['seq'].translate()
            seq2_translated = seqs[seq2]['seq'].translate()
        except TranslationError as e:
            print("Error translating {} / {}: {}".format(seq1, seq2, e),
                  file=sys.stderr)
            continue
        # Remove seq2 from the collection of seqs if it translates to the
        # same amino acid sequence as seq1. Add the counts for seq2 to seq1
        # before removing.
        if seq1_translated == seq2_translated:
            print(seq1, "translates identically to", seq2, ". Deleting", seq2)
            seqs[seq1]['count'] += seqs[seq2]['count']
            del seqs[seq2]

    with open('output.fa', 'w') as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_file(
            SeqRecord(seqs[seq]['seq'],
                      id=seq + COUNT_SEPARATOR + str(seqs[seq]['count']),
                      description="")
            for seq in seqs)
def split_records(fasta_in):
    for record in SeqIO.parse(fasta_in, "fasta"):
        in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
        out_basedir = os.path.realpath(
            os.path.join(os.path.dirname(fasta_in), in_fasta_basename))
        if not os.path.isdir(out_basedir):
            os.makedirs(out_basedir, exist_ok=True)
        out_filepath = os.path.join(out_basedir, record.id + ".fasta")
        print("%s %i -> %s" % (record.id, len(record), out_filepath))
        if not os.path.exists(out_filepath):
            with open(out_filepath, "w") as handle:
                fasta_out = FastaIO.FastaWriter(handle, wrap=None)
                fasta_out.write_header()
                fasta_out.write_record(record)
                #SeqIO.write(record, handle, "fasta")
        else:
            #raise IOError("%s already exists; skipping..." % out_filepath)
            print("%s already exists; skipping..." % out_filepath)
def filter_fasta_file_by_sequence_name(input_fasta_file_name,
                                       acceptable_sequence_names):
    """Yield only entries from a fasta file that are in acceptable_sequence_names.

    Args:
      input_fasta_file_name: string. This file should contain fasta entries that
        are formatted seqName_actualFamily, as above.
      acceptable_sequence_names: iterable of string. These contain just seqName
        (with no actualFamily, unlike `input_fasta_file_name`).

    Yields:
      strings, each of which is an entry for a fasta file.
    """
    acceptable_sequence_names = set(acceptable_sequence_names)
    with tf.io.gfile.GFileText(input_fasta_file_name) as input_file:
        for protein_name, sequence in FastaIO.SimpleFastaParser(input_file):
            if get_sequence_name_from(protein_name) in acceptable_sequence_names:
                yield '>' + protein_name + '\n' + sequence + '\n'
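# Usage sketch for filter_fasta_file_by_sequence_name: the file names and the
# sequence names below are hypothetical, and get_sequence_name_from is assumed
# to be defined elsewhere in the source module.
wanted = {"seqA", "seqB"}
with open("filtered.fasta", "w") as out:
    for entry in filter_fasta_file_by_sequence_name("all_families.fasta", wanted):
        out.write(entry)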