Esempio n. 1
0
 def bins_contig_gc(self, inputs, out_path, bin_id_pattern=r"bin\..+\.\d+"):
     """Write a tab-separated table with the GC fraction of every contig.

     Args:
         inputs: glob pattern that is expanded with ``glob.glob`` to find the
             bin FASTA files.
         out_path: destination of the table (columns ``ContigID``, ``BinID``,
             ``GC``), written without the DataFrame index.
         bin_id_pattern: regular expression searched in each file's base name;
             the matched text becomes the ``BinID``.

     Raises:
         AttributeError: if ``bin_id_pattern`` does not match a file's base
             name (``re.search`` returns ``None``).
     """
     rows = []
     for bp in glob.glob(inputs):
         bin_id = re.search(bin_id_pattern, os.path.basename(bp)).group()
         # presumably iter_fa yields (header, sequence) pairs - verify against
         # MetaGenome.pyutils.read
         for header, seq in iter_fa(bp, trim_line_break=True):
             # Contig ID = first whitespace-delimited token, minus the '>'.
             contig_id = header.split()[0].lstrip(">")
             # str.upper() returns a new string, so the result has to be kept;
             # otherwise lower-case (e.g. soft-masked) bases are not counted.
             seq = seq.upper()
             if seq:
                 gc = (seq.count('G') + seq.count('C')) / len(seq)
             else:
                 # An empty record has no defined GC fraction; record NaN
                 # instead of failing with ZeroDivisionError.
                 gc = float("nan")
             rows.append([contig_id, bin_id, gc])
     df = pd.DataFrame(rows, columns=["ContigID", "BinID", "GC"])
     df.to_csv(out_path, sep="\t", index=False)
Esempio n. 2
0
    def parse_bin_fa(self):
        """Collect IDs, sequences and lengths of the records in a FASTA file.

        NOTE(review): ``bp`` is not defined in the visible part of this
        method - confirm where the path comes from (instance attribute?
        module-level name?).
        NOTE(review): the visible part ends right after ``df`` is built, so
        what the method returns or stores cannot be determined from here.
        """
        # Maps record ID -> sequence.
        seqs_dict = {}
        # Record IDs and sequence lengths, kept in file order.
        kar_ids = []
        lens = []
        for header, seq in iter_fa(bp, trim_line_break=True):
            # Record ID = first whitespace-delimited token, minus the '>'.
            kar_id = header.split()[0].lstrip(">")
            seqs_dict[kar_id] = seq
            kar_ids.append(kar_id)
            lens.append(len(seq))

        # One row per record, indexed by record ID, single "Length" column.
        df = pd.DataFrame({"Length": lens}, index=kar_ids)
Esempio n. 3
0
               help="input fasta files.",
               default=False,
               metavar='<path>')
# The threshold is compared with ``len(seq)`` later on, so it has to be parsed
# as an integer: without ``type=int`` argparse hands over a string and the
# comparison raises TypeError on Python 3.  The default 0 is falsy, which
# keeps the "no filtering unless requested" behaviour.
p.add_argument('--minlen',
               '-l',
               type=int,
               help="min contigs length to keep in output",
               default=0,
               metavar='<int>')
# Opt-in flag: overwrite each input with its renamed copy.
p.add_argument('--replaceinput',
               action='store_true',
               help="replace input files")

options = p.parse_args()

# argparse delivers option values as strings unless a ``type=`` converter is
# configured, and comparing ``len(seq)`` with a string raises TypeError on
# Python 3.  Normalise the threshold once; 0 means "no length filter".
min_len = int(options.minlen) if options.minlen else 0

# Running contig number; it is shared by all inputs, so numbering continues
# from one file to the next.
i = 1
for ipt in options.inputs:
    with open(ipt + '.renamed', 'w') as fout:
        # NOTE(review): header and seq are written back unchanged, so iter_fa
        # presumably keeps their line breaks when trim_line_break is not
        # requested; if so, len(seq) includes them - verify.
        for header, seq in iter_fa(ipt):
            if min_len and len(seq) <= min_len:
                continue

            # Swap the first whitespace-delimited token of the header for the
            # new sequential name; the rest of the header line is preserved.
            header = re.sub(r'^\S+', ">contig_" + str(i), header)
            fout.write(header + seq)
            i += 1

if options.replaceinput:
    for ipt in options.inputs:
        # os.replace overwrites the destination in a single step, so the
        # input is never left deleted-but-not-yet-replaced, which could
        # happen between the former os.remove and os.rename calls.
        os.replace(ipt + '.renamed', ipt)
    description="extract sequences from an fasta file with sequence headers")

# All three options are mandatory; register them from one table so the flag
# names and help texts sit side by side.
for short_flag, long_flag, help_text in (
        ('-s', '--headers', 'file containing sequence headers.'),
        ('-f', '--fasta', 'fasta file containing sequences.'),
        ('-o', '--out_file',
         'file to save the extracted sequences. fasta format.'),
):
    p.add_argument(short_flag, long_flag, required=True, help=help_text)
options = p.parse_args()

# Keys of the wanted sequences: for each line of the headers file, the text
# between the first and second underscore of its first whitespace-delimited
# token.
wanted_keys = set()
with open(options.headers) as file_handler:
    for line in file_handler:
        fields = line.split()
        if not fields:
            # Blank lines carry no header; skipping them avoids an
            # IndexError on fields[0].
            continue
        wanted_keys.add(fields[0].split('_')[1])

# Sequences already written, so the same sequence is emitted only once even
# if it occurs under several headers.
seen_seqs = set()

with open(options.out_file, 'w') as fh:
    for header, seq in iter_fa(options.fasta, trim_line_break=True):
        if seq in seen_seqs:
            continue
        # Same key extraction as for the headers file above.
        if header.split()[0].split('_')[1] in wanted_keys:
            fh.write("{}\n{}\n".format(header, seq))
            seen_seqs.add(seq)
Esempio n. 5
0
#!/usr/bin/env python3.8
from MetaGenome.pyutils.read import iter_fa
import argparse
p = argparse.ArgumentParser(
    description="find subset contigs in union contigs.")

# All three options are mandatory; register them from one table so the flag
# names and help texts sit side by side.
for short_flag, long_flag, help_text in (
        ('-s', '--subset', 'fasta file containing subset contigs'),
        ('-u', '--union', 'fasta file containing union contigs'),
        ('-o', '--out_file', 'file to save the found match contig names'),
):
    p.add_argument(short_flag, long_flag, required=True, help=help_text)

options = p.parse_args()

# Every sequence found in the subset file; headers are irrelevant here because
# matching is done on the sequence text alone.
subset_seqs = {
    seq for _, seq in iter_fa(options.subset, trim_line_break=True)
}

# Write the header of each union record whose sequence also occurs in the
# subset, one header per line, in union-file order.
with open(options.out_file, 'w') as file_handler:
    file_handler.writelines(
        "{}\n".format(header)
        for header, seq in iter_fa(options.union, trim_line_break=True)
        if seq in subset_seqs
    )