def stream_fa(sequence_file: str) -> Generator[FastA, None, None]:
    '''
    Stream the records of a FASTA or FASTQ file, gzipped or not, as FastA
    objects.

    The format and the compression are chosen from the file name suffix:
    ``fq``/``fastq`` (optionally followed by ``.gz``) is parsed as FASTQ and
    ``fasta``/``fa`` (optionally followed by ``.gz``) is parsed as FASTA.
    FASTQ quality strings are read but discarded, so every record is reduced
    to ``FastA(header, sequence)``.

    :param sequence_file: path to the sequence file
    :return: Generator[FastA, None, None]
    :raises ValueError: on first iteration, if the suffix is not recognised
    '''
    gz_suffix = '.gz'
    gzipped = sequence_file.endswith(gz_suffix)
    stem = sequence_file[:-len(gz_suffix)] if gzipped else sequence_file

    # FASTQ suffixes are tested first, exactly as before.
    if stem.endswith(('fq', 'fastq')):
        parse = Bio.SeqIO.QualityIO.FastqGeneralIterator
    elif stem.endswith(('fasta', 'fa')):
        parse = FastaIO.SimpleFastaParser
    else:
        raise ValueError(f'{sequence_file} not a sequence file.')

    opener = gzip.open if gzipped else open
    with opener(sequence_file, 'rt') as handle:
        # The FASTQ parser yields (header, sequence, quality) and the FASTA
        # parser yields (header, sequence); only the first two are kept.
        for header, sequence, *_ in parse(handle):
            yield FastA(header, sequence)
def stream_fa(infile):
    """Yield ``(header, sequence)`` tuples from a FASTA file, gzipped or not.

    The file is opened with ``gzip`` when its name ends in ``fasta.gz`` or
    ``fa.gz`` and as plain text when it ends in ``fasta`` or ``fa``.

    :param infile: path to the FASTA file
    :raises ValueError: on first iteration, if the suffix is not recognised
    """
    if infile.endswith(('fasta.gz', 'fa.gz')):
        opener = gzip.open
    elif infile.endswith(('fasta', 'fa')):
        opener = open
    else:
        raise ValueError(f'{infile} not a sequence file.')

    with opener(infile, 'rt') as handle:
        for (header, sequence) in FastaIO.SimpleFastaParser(handle):
            yield (header, sequence)
예제 #3
0
파일: search_uniq.py 프로젝트: Nanguage/UBW
def main(input, blastndb, output, probe_length, search_step, evalue,
         blastn_tmpdir, threads):
    """
    Search uniquely mapped probes (sub-sequences)
    within a series of sequences stored in a fasta file.

    \b
    For example:
    first select 30 candidate probe regions with length 500bp,
    ```
    $ python uniformly_spaced.py data/hg19.fa ./candidate.fa chr1:89000000-90000000 -n 30 -l 500
    ```
    then select the uniquely mapped probes (sub-sequences) from them.
    ```
    $ python search_uniq.py candidate.fa example/blastn_db/hg19 probe.fa
    ```

    \b
    Args
    ----
    input : str
        Path to input fasta file.
    blastndb : str
        Path to blastn database,
        built with the `makeblastdb` command.
    output : str
        Path to output fasta file.
    probe_length, search_step, evalue, blastn_tmpdir, threads
        Forwarded unchanged to `search_passed_probes`.

    """
    with open(input) as fasta_handle:
        candidates = FastaIO.FastaIterator(fasta_handle)
        unique_probes = search_passed_probes(
            candidates, blastndb, evalue, probe_length, search_step,
            blastn_tmpdir, threads)
        # Saved while the input is still open, as the candidates are read
        # from an iterator over the open handle.
        save_fasta(unique_probes, output)
def first_occurrences_only(fasta_in):
    """Write a copy of *fasta_in* keeping only the first record of each
    distinct sequence.

    The output is written next to the input as
    ``<input basename>_first_occurrences_only.fasta`` with unwrapped
    sequences. Sequences are compared through the SHA-256 digest of their
    text. A summary of the counts is printed when done.

    :param fasta_in: path to the input FASTA file
    :raises IOError: if the output file already exists
    """
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(
        out_basedir, in_fasta_basename + "_first_occurrences_only.fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    total_seq_count = 0
    # Local to each call: the set was previously never created here, so the
    # function either failed with NameError or shared state between calls.
    hashes_seen_before = set()

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            total_seq_count += 1
            record_hash = hashlib.sha256(
                str(record.seq).encode("UTF-8")).hexdigest()
            if record_hash not in hashes_seen_before:
                hashes_seen_before.add(record_hash)
                fasta_out.write_record(record)

    unique_seq_count = len(hashes_seen_before)
    print("{} seqs seen".format(total_seq_count))
    print("{} unique seqs found".format(unique_seq_count))
    print("{} identical duplicates removed".format(total_seq_count -
                                                   unique_seq_count))
예제 #5
0
def hashing(unhashed_otu_table_list, unhashed_rep_seqs_list,
            sample_metadata_list):
    """Merge OTU tables, representative sequences and sample metadata.

    Writes three files into the current working directory:
    ``sample_metadata.tsv``, ``otu_table.biom`` and ``rep_seqs.fasta``.

    :param unhashed_otu_table_list: inputs passed one by one to
        ``hash_otu_table``; the results are concatenated column-wise
    :param unhashed_rep_seqs_list: inputs passed one by one to
        ``hash_rep_seqs``
    :param sample_metadata_list: paths of tab-separated metadata tables
    :raises AssertionError: if the OTU table ids and the representative
        sequence ids differ
    """
    rep_seq_ids = set()
    seqs = []
    # Create OTU table
    otu_df_list = [
        hash_otu_table(unhashed_otu_table)
        for unhashed_otu_table in unhashed_otu_table_list
    ]
    otu_df = pd.concat(otu_df_list, join="outer", axis=1)
    otu_df.fillna(0.0, inplace=True)
    otu_table = Table(otu_df.values, list(otu_df.index), list(otu_df.columns))
    # Create rep seqs
    # NOTE(review): rep_seq_ids is presumably filled in by hash_rep_seqs;
    # confirm against its definition.
    for unhashed_rep_seqs in unhashed_rep_seqs_list:
        seqs.extend(hash_rep_seqs(unhashed_rep_seqs, rep_seq_ids))
    otu_table_ids = set(otu_df.index)
    assert otu_table_ids == rep_seq_ids, \
        "OTU table ids and representative sequence ids differ"
    assert len(otu_df.index) == len(rep_seq_ids), \
        "OTU table index contains duplicated ids"
    # Merge sample metadata
    # A real tab character is used as separator: the two-character string
    # "\\t" is rejected by to_csv, which needs a single-character delimiter.
    sample_metadata = pd.concat(
        [pd.read_csv(s, sep="\t") for s in sample_metadata_list])
    # Write files
    sample_metadata.to_csv("sample_metadata.tsv", sep="\t", index=False)
    with biom_open("otu_table.biom", "w") as fid:
        otu_table.to_hdf5(fid,
                          "Constructed by micone in dada2/deblur pipeline")
    with open("rep_seqs.fasta", "w") as fid:
        fasta_writer = FastaIO.FastaWriter(fid, wrap=None)
        fasta_writer.write_file(seqs)
예제 #6
0
def load_data(k, stride, pos_fasta, neg_fasta):
    """Build a k-mer count matrix and label vector from two FASTA files.

    Records containing ``vocab.unknow_char`` and records for which
    ``vocab.kmer_count`` raises ``AssertionError`` are skipped.

    :param k: k-mer size handed to ``Vocabulary``
    :param stride: stride handed to ``vocab.kmer_count``
    :param pos_fasta: path of the FASTA file holding positive examples
    :param neg_fasta: path of the FASTA file holding negative examples
    :return: ``(X, y)`` where ``X`` stacks the per-sequence k-mer counts
        (positives first) and ``y`` holds 1.0 for positives and 0.0 for
        negatives
    """
    vocab = Vocabulary(k=k)

    X = []
    n_pos = 0
    n_neg = 0
    # The label travels with the file instead of being recovered by comparing
    # paths, so passing the same path twice no longer labels everything as
    # positive.
    for fasta, is_positive in ((pos_fasta, True), (neg_fasta, False)):
        with open(fasta) as f:
            for s in tqdm(FastaIO.FastaIterator(f)):
                seq = str(s.seq)
                if vocab.unknow_char in seq:
                    continue
                try:
                    x = vocab.kmer_count(seq, stride)
                except AssertionError:
                    continue
                X.append(x)
                if is_positive:
                    n_pos += 1
                else:
                    n_neg += 1

    X = np.vstack(X)
    y = np.hstack([np.ones(n_pos), np.zeros(n_neg)])
    return X, y
def remap_tax_id(fasta_in, remap_table):
    """Write a copy of *fasta_in* whose descriptions come from *remap_table*.

    *remap_table* is a tab-separated file whose first column is a sequence
    id; the remaining columns are joined with ``|`` (empty fields become
    ``?``) and used as the record description. Records whose id is in the
    table but whose remaining columns are all empty are dropped; records
    whose id is missing from the table are reported and dropped.

    The output is written next to the input as
    ``<input basename>_annotated_for_beast.fasta``.

    :param fasta_in: path to the input FASTA file
    :param remap_table: path to the tab-separated annotation table
    :raises IOError: if the output file already exists
    :raises LookupError: if a sequence id occurs twice in the table
    """
    in_fasta_basename = os.path.splitext(os.path.basename(fasta_in))[0]
    out_basedir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(out_basedir,in_fasta_basename+"_annotated_for_beast.fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    id_map = dict()
    with open(remap_table, "r") as map_handle:
        for line in map_handle:
            # Drop the line terminator first: otherwise it stays glued to the
            # last field, which defeats the "all fields empty" test below and
            # ends up in the description (or in the id for one-column lines).
            seqid,*other_fields = line.rstrip("\n").split("\t")
            if seqid in id_map:
                raise LookupError("%s already found in map" % seqid)
            if seqid=="taxa":
                # if this is a figtree-formatted annotation file, skip the header row
                # that has nothing but column labels
                continue
            id_map[seqid] = other_fields

    with open(out_filepath, "w") as handle:
        fasta_out = FastaIO.FastaWriter(handle, wrap=None)
        fasta_out.write_header()
        for record in SeqIO.parse(fasta_in, "fasta"):
            if record.id in id_map:
                fields = id_map[record.id]
                if any(fields):
                    # The replacement runs twice on purpose: one pass turns
                    # "|||" into "|?||", the second pass fills the gap left.
                    new_description="|".join(fields).replace("||","|?|").replace("||","|?|")
                    record.description=new_description
                    fasta_out.write_record(record)
            else:
                print("Warning: '{}' not found in {}".format(record.id,os.path.basename(remap_table)))
예제 #8
0
 def unwrap_fasta(infile, outfile, strip_comment=False):
     """
     Copy the FASTA records of *infile* to *outfile* without line wrapping.

     :param str infile: The path to the input FASTA file.
     :param str outfile: The path to the output file.
     :param bool strip_comment: When true, record titles are produced by
         ``Fastq2Fasta.just_name`` instead of the writer's default.
     """
     with open(outfile, "w") as fasta_out:
         writer_options = {"wrap": None}
         if strip_comment:
             writer_options["record2title"] = Fastq2Fasta.just_name
         unwrapped_writer = FastaIO.FastaWriter(fasta_out, **writer_options)
         unwrapped_writer.write_file(SeqIO.parse(infile, 'fasta'))
예제 #9
0
def write_fasta_1line(input_fasta_file, output_fasta_file):
    """Rewrite a FASTA file with every sequence on a single line.

    All records are loaded into memory first; the input path and the number
    of records are printed before the output is written.

    :param input_fasta_file: path to the FASTA file to read
    :param output_fasta_file: path to the unwrapped FASTA file to write
    """
    with open(input_fasta_file, "r") as source:
        loaded_records = list(SeqIO.parse(source, "fasta"))
        print(input_fasta_file)
        print("Number of protein sequences: ", len(loaded_records))

    with open(output_fasta_file, "w") as sink:
        FastaIO.FastaWriter(sink, wrap=None).write_file(loaded_records)
예제 #10
0
def input_text_to_df(input_text):
    """Parse FASTA text into a DataFrame with columns sequence_name and sequence.

    Each row holds a record's ``name`` and its sequence as a plain string.
    """
    with io.StringIO(input_text) as text_stream:
        parsed_records = list(FastaIO.FastaIterator(text_stream))

    rows = []
    for record in parsed_records:
        rows.append((record.name, str(record.seq)))
    return pd.DataFrame(rows, columns=['sequence_name', 'sequence'])
def Main():
    """Generate synthetic transposition reads and write them as FASTA.

    Command-line driven: for each of ``--num_transpositions`` constructs,
    ``--num_reads`` fragments of ``--read_length`` are sheared and written to
    ``--output_fname``. A gzip-compressed copy of that file is then written
    to ``<output_fname>.gz``.
    """
    # Imported locally so the block does not depend on the file's import list.
    import shutil

    parser = argparse.ArgumentParser(description='Generate synthetic reads.',
                                     fromfile_prefix_chars='@')
    parser.add_argument("-t",
                        "--num_transpositions",
                        type=int,
                        default=50,
                        help="Number of transpositions to generate")
    parser.add_argument("-r",
                        "--num_reads",
                        type=int,
                        default=10000,
                        help="Number of reads to generate per transposition.")
    parser.add_argument("-l",
                        "--read_length",
                        type=int,
                        default=100,
                        help="Length of each generated read, in nucleotides.")
    parser.add_argument("-o",
                        "--output_fname",
                        default='generated_transposition_reads.fa',
                        help="Where to write FASTA output to.")
    TranspositionParams.AddArgs(parser)
    args = parser.parse_args()

    tn_params = TranspositionParams.FromArgs(args)
    insert_gen = InsertGenerator.FromTranspositionParams(tn_params)

    n_trans = args.num_transpositions
    n_reads = args.num_reads
    read_len = args.read_length
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print('Generating %d random transpositions with %d random %d NT reads each' % (
        n_trans, n_reads, read_len))

    print('Writing generated reads to FASTA')
    x = 0
    with open(args.output_fname, 'w') as fh:
        writer = FastaIO.FastaWriter(fh)
        writer.write_header()
        for construct_num in range(n_trans):
            trans = Transposition(construct_num, insert_gen,
                                  tn_params.backbone_seq,
                                  tn_params.backbone_start_offset)
            for read_num in range(n_reads):
                frag = trans.Shear(read_num, read_len)
                record = frag.ToSeqRecord()
                writer.write_record(record)

                x += 1
                if x % 1000000 == 0:
                    print("Created %d reads" % x)

        writer.write_footer()

    print('Zipping generated reads.')
    gzip_fname = '%s.gz' % args.output_fname
    # Compress the contents of the FASTA file; the previous code wrote the
    # file *name* into the archive instead of the reads.
    with open(args.output_fname, 'rb') as fasta_in:
        with GzipFile(gzip_fname, mode='wb') as gzipf:
            shutil.copyfileobj(fasta_in, gzipf)
def export_dna_record(gene_seq, gene_id, gene_description, output_handle):
    """Write one DNA sequence to *output_handle* as an unwrapped FASTA record.

    :param gene_seq: the nucleotide sequence, wrapped in a ``Seq`` with the
        ``IUPAC.unambiguous_dna`` alphabet
    :param gene_id: value stored as the record id
    :param gene_description: value stored as the record description
    :param output_handle: handle given to ``FastaIO.FastaWriter``
    """
    dna_record = SeqRecord(Seq(gene_seq, IUPAC.unambiguous_dna))
    dna_record.id = gene_id
    dna_record.description = gene_description

    single_line_writer = FastaIO.FastaWriter(output_handle, wrap=None)
    single_line_writer.write_header()
    single_line_writer.write_record(dna_record)
    single_line_writer.write_footer()
예제 #13
0
def openFasta(path):
    """Load a FASTA file as a dict mapping trimmed reference name to sequence.

    The reference name is the title up to the first whitespace (as in the
    reference names of a bam file).
    """
    from Bio.SeqIO import FastaIO

    with open(path) as handle:
        # Keyed by full title first, so repeated titles collapse before the
        # names are trimmed.
        sequences_by_title = dict(FastaIO.SimpleFastaParser(handle))

        sequences_by_name = {}
        for title, sequence in sequences_by_title.items():
            sequences_by_name[title.split()[0]] = sequence
        return sequences_by_name
예제 #14
0
def get_proteins(path):
    """Collect the distinct record titles of a FASTA file.

    The set of titles is passed through ``res.get_unified_names`` and the
    result of that call is returned.
    """
    with open(path) as fasta_handle:
        titles = {
            title for title, _sequence in fio.SimpleFastaParser(fasta_handle)
        }

    return res.get_unified_names(titles)
예제 #15
0
def recomp(input, output, both=False):
    """Write the reverse complement of every FASTA record in *input*.

    Each reverse-complemented record gets the id ``<original id>_RC`` and an
    empty description; sequences are written unwrapped to *output*.

    :param input: FASTA source given to ``SeqIO.parse``
    :param output: target given to ``FastaIO.FastaWriter``
    :param both: when equal to ``True``, the original record is written
        immediately before its reverse complement
    """
    writer = FastaIO.FastaWriter(output, wrap=None)
    writer.write_header()
    for forward in SeqIO.parse(input, "fasta"):
        reverse = forward.reverse_complement(id=forward.id + "_RC",
                                             description="")
        # Kept as an equality test against True rather than a truthiness
        # test, so the set of accepted flag values is unchanged.
        to_emit = [forward, reverse] if both == True else [reverse]
        for entry in to_emit:
            writer.write_record(entry)
    writer.write_footer()
def write(data, filename):
    """Write *data* to *filename* as unwrapped FASTA.

    :param data: iterable of mappings with the keys ``'seq'``, ``'id'`` and
        ``'description'``
    :param filename: path of the FASTA file to create
    """
    with open(filename, "w") as handle:
        unwrapped_writer = FastaIO.FastaWriter(handle, wrap=None)
        converted = [
            SeqRecord(Seq(entry['seq']),
                      id=entry['id'],
                      description=entry['description'])
            for entry in data
        ]
        unwrapped_writer.write_file(converted)
예제 #17
0
def get_fasta(pdb_file, fasta_file, transfer_ids=None):
    """Write the SEQRES sequences of a PDB file in FASTA format.

    Records with an empty sequence are skipped. Each written record is also
    printed (id, sequence, length).

    :param pdb_file: source given to ``PdbIO.PdbSeqresIterator``
    :param fasta_file: target given to ``FastaIO.FastaWriter``
    :param transfer_ids: optional container of record ids; when given, only
        records whose id is in it are written
    """
    writer = FastaIO.FastaWriter(fasta_file)
    writer.write_header()
    for chain in PdbIO.PdbSeqresIterator(pdb_file):
        if len(chain.seq) != 0 and (transfer_ids is None
                                    or chain.id in transfer_ids):
            print(chain.id, chain.seq, len(chain.seq))
            writer.write_record(chain)
예제 #18
0
def perform_mapping(mapping_dir,
                    og_files,
                    threshold=1,
                    exclude_species=["none"]):
    """Rebuild orthologous groups (OGs) with the best mapped sequence added.

    The records of *og_files* are grouped by the last `` | ``-separated field
    of their description. For every ``*.fa`` file found under *mapping_dir*,
    the matching group gets the record returned by
    ``find_best_translation_by_similarity`` appended; groups whose
    ``get_coverage`` value is at most *threshold* are written, unwrapped, to
    an output directory below *mapping_dir* and returned.

    :param mapping_dir: directory prefix holding the mapping FASTA files;
        it is concatenated as-is, so it must end with a path separator
    :param og_files: FASTA source with the OG amino acid sequences
    :param threshold: upper bound on ``get_coverage`` for a group to be kept
    :param exclude_species: forwarded to
        ``find_best_translation_by_similarity``; this function never mutates
        the list
    :return: dict mapping OG name to its list of records
    """
    # read in og with aa seq; an id is kept only once per group
    og_dict = {}
    for record in SeqIO.parse(og_files, "fasta"):
        key = record.description.split(" | ")[-1]
        members = og_dict.setdefault(key, [])
        if all(rec.id != record.id for rec in members):
            members.append(record)

    # parse the mapped reads to ogs to dictionary
    all_dict = {}
    for mapping_file in glob.glob(mapping_dir + "*.fa"):
        og_name = mapping_file.split("_")[-1].split(".")[0]
        og = og_dict[og_name]

        # change ids to species names (the first five characters of the id)
        for record in og:
            record.id = record.id[0:5]

        # find the best representative seq based by mapping
        mapping = list(SeqIO.parse(mapping_file, "fasta"))
        best_translated_seq = find_best_translation_by_similarity(
            mapping, og[0], exclude_species=exclude_species)
        if best_translated_seq is not None:
            og.append(best_translated_seq)
            if get_coverage(og) <= threshold:
                all_dict[og_name] = og

    # Values are compared with != rather than `is not`: identity tests
    # against literals depend on interpreter caching and give the wrong
    # directory for e.g. 1.0 or a "none" read from the command line.
    if threshold != 1:
        OG_OUT = mapping_dir + 'og' + str(threshold) + "/"
    elif exclude_species[0] != "none":
        OG_OUT = mapping_dir + 'og_without' + "_".join(exclude_species) + "/"
    else:
        OG_OUT = mapping_dir + 'og/'

    if not os.path.exists(OG_OUT):
        os.makedirs(OG_OUT)

    for key, item in all_dict.items():
        file_name = OG_OUT + key + ".fa"
        # The handle is closed explicitly so every file is flushed to disk.
        with open(file_name, "w") as handle:
            fasta_out = FastaIO.FastaWriter(handle, wrap=None)
            fasta_out.write_file(item)

    print("FINISHED OG RECONSTRUCTION!")
    return all_dict
예제 #19
0
파일: hashing.py 프로젝트: Vikash84/MiCoNE
def hash_rep_seqs(unhashed_rep_seqs, output_file):
    """Rename representative sequences after the hash of their sequence.

    Every record gets ``hash_function(str(record.seq))`` as id and an empty
    name and description; the records are then written, unwrapped, to
    *output_file*.

    :param unhashed_rep_seqs: FASTA source given to ``SeqIO.parse``
    :param output_file: path of the FASTA file to write
    :return: list of the new ids, in input order
    """
    records = list(SeqIO.parse(unhashed_rep_seqs, "fasta"))
    for record in records:
        record.id = hash_function(str(record.seq))
        record.description = ""
        record.name = ""
    hashed_ids = [record.id for record in records]

    with open(output_file, "w") as fid:
        FastaIO.FastaWriter(fid, wrap=None).write_file(records)
    return hashed_ids
예제 #20
0
def get_seq_from_ids(inputfile=None, outputfile=None, id_file=None):
    """Print the FASTA records of *inputfile* whose ids are listed in *id_file*.

    :param inputfile: object whose ``name`` attribute is the FASTA path
    :param outputfile: target given to ``FastaIO.FastaWriter``
    :param id_file: open file with one id per line; ``>`` characters and the
        trailing newline are removed from each line
    :raises KeyError: if a listed id is not present in the FASTA file
    """
    wanted_ids = [
        raw_line.replace(">", "").rstrip("\n")
        for raw_line in id_file.readlines()
    ]

    records_by_id = SeqIO.to_dict(SeqIO.parse(inputfile.name, "fasta"))

    # NOTE(review): this writer is created but never written to; the records
    # go to standard output instead. Confirm whether that is intended.
    fasta_out = FastaIO.FastaWriter(outputfile, wrap=None)

    for seq_id in wanted_ids:
        match = records_by_id[seq_id]
        print(">" + match.id)
        print(match.seq)
예제 #21
0
def write_fasta_with_sanitized_ids(fasta_in, out_filepath):
    """Copy *fasta_in* to *out_filepath* with every id sanitized.

    Each record id is replaced by ``sanitize_id_for_sam_rname(id)`` and the
    sequences are written unwrapped. Afterwards the output path, its
    directory and an ``ls -lah`` listing of that directory are printed.

    :return: *out_filepath*
    """
    with open(out_filepath, "w") as handle:
        writer = FastaIO.FastaWriter(handle, wrap=None)
        writer.write_header()
        for entry in SeqIO.parse(fasta_in, "fasta"):
            entry.id = sanitize_id_for_sam_rname(entry.id)
            writer.write_record(entry)

    out_dir = os.path.dirname(out_filepath)
    print("out_filepath",out_filepath)
    print("os.path.dirname(out_filepath)",out_dir)
    print("ls -lah")
    listing = subprocess.check_output(["ls","-lah",out_dir]).decode("utf-8")
    for listing_line in listing.split("\n"):
        print(listing_line)
    return out_filepath
def MakeConcensusAlignment(pair):
    """Align the given records with mafft.

    The records are written unwrapped to ``tmp.fasta`` in the current
    directory, then ``mafft tmp.fasta > tmp_2.fasta`` is run through the
    shell. The command is printed before it is executed.

    :param pair: records given to ``FastaWriter.write_file``
    :return: an empty tuple
    """
    write_tmpfasta = "tmp.fasta"
    # The context manager flushes and closes the file before mafft reads it.
    # The previous code referenced an undefined name (`handle1.close`) and
    # never closed the handle, so waiting for the file to appear is no
    # longer needed.
    with open(write_tmpfasta, "w") as handle:
        writer = FastaIO.FastaWriter(handle, wrap=None)
        writer.write_file(pair)

    # The command is a constant string, so running it through the shell (for
    # the output redirection) does not expose it to untrusted input.
    command = "mafft " + write_tmpfasta + " > tmp_2.fasta"
    print(command)
    subprocess.call(command, shell=True)

    return ()
예제 #23
0
def all_sequence_names_from_fasta_file(input_fasta_file_name):
  """Lists the sequence name of every entry in a fasta file.

  Each entry title is converted with `get_sequence_name_from`.

  Args:
    input_fasta_file_name: string.

  Returns:
    list of string, in file order.
  """
  sequence_names = []
  with tf.io.gfile.GFileText(input_fasta_file_name) as input_file:
    for title, _sequence in FastaIO.SimpleFastaParser(input_file):
      sequence_names.append(get_sequence_name_from(title))
  return sequence_names
예제 #24
0
def single_line_records(fasta_in):
    """Write a copy of *fasta_in* with every sequence on a single line.

    The output is written next to the input as
    ``<input basename>_single_lines.fasta``.

    :param fasta_in: path to the input FASTA file
    :raises IOError: if the output file already exists
    """
    stem, _extension = os.path.splitext(os.path.basename(fasta_in))
    target_dir = os.path.realpath(os.path.dirname(fasta_in))
    out_filepath = os.path.join(target_dir, stem + "_single_lines.fasta")

    if os.path.exists(out_filepath):
        raise IOError("%s already exists; skipping..." % out_filepath)

    with open(out_filepath, "w") as handle:
        unwrapped_writer = FastaIO.FastaWriter(handle, wrap=None)
        unwrapped_writer.write_header()
        for entry in SeqIO.parse(fasta_in, "fasta"):
            unwrapped_writer.write_record(entry)
예제 #25
0
def _assert_fasta_parsable(input_text):
    """Raise ValueError when no FASTA record can be read from *input_text*.

    Only the first record is requested, so the whole text is not parsed.
    The error message quotes the first three lines of the input.
    """
    nothing_parsed = object()
    with io.StringIO(initial_value=input_text) as text_stream:
        # An iterator that yields nothing is treated as malformed input;
        # the sentinel tells "no record" apart from any real record.
        first_record = next(FastaIO.FastaIterator(text_stream), nothing_parsed)
        if first_record is not nothing_parsed:
            return

        raise ValueError(
            'Failed to parse any input from fasta file. '
            'Consider checking the formatting of your fasta file. '
            'First bit of contents from the fasta file was\n'
            '{}'.format(input_text.splitlines()[:3]))
예제 #26
0
def write_records(records, output_file):
    """ Saves BioPython SeqRecord objects as unwrapped FASTA.

        Parameters
        ----------
        records : list
            BioPython SeqRecord objects to write.
        output_file : str
            Path of the file to create (an existing file is overwritten).
    """

    with open(output_file, 'w') as fasta_handle:
        FastaIO.FastaWriter(fasta_handle, wrap=None).write_file(records)
예제 #27
0
def convert_multiline_to_single_line_FASTA():
    """Rewrite ``preads4falcon.fasta`` with one line per sequence.

    Reads ``preads4falcon.fasta`` from the current directory and writes the
    records, unwrapped, to ``formatted_preads4falcon.fasta``. As a side
    effect the module-level ``read_len_dict`` receives the sequence length
    of every record under both ``<id>`` and ``<id>'``.
    """
    global read_len_dict

    sequences = []
    # Plain "r" already gives universal newlines on Python 3; the former
    # "rU" mode is rejected by Python 3.11 and later. The context manager
    # also closes the input, which was previously left open.
    with open("preads4falcon.fasta", "r") as input_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            sequences.append(record)
            read_len_dict[record.id] = len(record.seq)
            record_complement = (record.id) + "'"
            read_len_dict[record_complement] = len(record.seq)

    with open("formatted_preads4falcon.fasta", "w") as output_handle:
        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
        fasta_out.write_file(sequences)
예제 #28
0
def main(args):
    """Collapse sequences that translate to the same amino acid sequence.

    Record ids of ``args.input_fasta`` may carry a count as ``<name>_x<count>``
    (a missing suffix counts as 1). Whenever two sequences translate
    identically, the later one is dropped and its count is added to the
    earlier one. The survivors are written, unwrapped, to ``output.fa`` with
    ids of the form ``<name>_x<count>``.

    :param args: object with an ``input_fasta`` attribute
    """
    COUNT_SEPARATOR = '_x'
    seqs = {}

    for seq_record in SeqIO.parse(args.input_fasta, "fasta"):
        split_seqid = seq_record.id.split(COUNT_SEPARATOR)
        if len(split_seqid) > 2:
            # The id needs a placeholder: without one the logging module
            # reports a formatting error instead of the message.
            logging.error("Error parsing: %s", seq_record.id)
            continue
        count = int(split_seqid[1]) if len(split_seqid) == 2 else 1
        seqs[split_seqid[0]] = {'seq': seq_record.seq, 'count': count}

    # combinations('ABCD', 2) gives:
    # AB AC AD BC BD CD
    # ie. we don't need to compare AB and BA
    for seq1, seq2 in combinations(seqs, 2):
        # Need to skip over sequences that have been removed on previous iterations
        if seq1 not in seqs or seq2 not in seqs:
            continue

        # Translate each pair of seqs
        translations = []
        for name in (seq1, seq2):
            try:
                translations.append(seqs[name]['seq'].translate())
            except TranslationError:
                print("Error translating: " + name, file=sys.stderr)
        if len(translations) != 2:
            # A failed translation must not be compared: the old code went on
            # with an unbound name or a value left over from an earlier pair.
            continue
        seq1_translated, seq2_translated = translations

        # Remove seq2 from collection of seqs if it translates to the
        # same amino acid sequence as seq1. Add the counts for seq2 to seq1 before removing.
        if seq1_translated == seq2_translated:
            print(seq1, " translates identicaly to ", seq2, ". Deleting ", seq2)
            seqs[seq1]['count'] += seqs[seq2]['count']
            del seqs[seq2]

    # The handle is closed explicitly so the output is flushed to disk.
    with open('output.fa', 'w') as output_handle:
        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
        fasta_out.write_file(
            SeqRecord(entry['seq'],
                      id=name + COUNT_SEPARATOR + str(entry['count']),
                      description="")
            for name, entry in seqs.items())
def split_records(fasta_in):
    """Write every record of *fasta_in* to its own unwrapped FASTA file.

    The files are named ``<record id>.fasta`` and placed in a directory named
    after the input file (without extension), next to the input. The
    directory is created when the first record is handled. Existing files
    are left untouched and reported.

    :param fasta_in: path to the input FASTA file
    """
    for entry in SeqIO.parse(fasta_in, "fasta"):
        stem = os.path.splitext(os.path.basename(fasta_in))[0]
        target_dir = os.path.realpath(
            os.path.join(os.path.dirname(fasta_in), stem))
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir, exist_ok=True)

        out_filepath = os.path.join(target_dir, entry.id + ".fasta")
        print("%s %i -> %s" % (entry.id, len(entry), out_filepath))

        if os.path.exists(out_filepath):
            print("%s already exists; skipping..." % out_filepath)
            continue

        with open(out_filepath, "w") as handle:
            single_writer = FastaIO.FastaWriter(handle, wrap=None)
            single_writer.write_header()
            single_writer.write_record(entry)
예제 #30
0
def filter_fasta_file_by_sequence_name(input_fasta_file_name,
                                       acceptable_sequence_names):
  """Yields the fasta entries whose sequence name is acceptable.

  The sequence name of an entry is obtained by applying
  `get_sequence_name_from` to its title.

  Args:
    input_fasta_file_name: string. This file should contain fasta entries that
      are formatted seqName_actualFamily.
    acceptable_sequence_names: iterable of string. These are just seqName (no
      actualFamily, unlike the titles in `input_fasta_file_name`).

  Yields:
    strings, each of which is an entry for a fasta file ('>' + title, a
    newline, the sequence and a final newline).
  """
  wanted_names = set(acceptable_sequence_names)
  with tf.io.gfile.GFileText(input_fasta_file_name) as input_file:
    for title, sequence in FastaIO.SimpleFastaParser(input_file):
      if get_sequence_name_from(title) not in wanted_names:
        continue
      yield '>' + title + '\n' + sequence + '\n'