예제 #1
0
def add_gc_content(df, df_sub, args):
    """Fill the ``gc_content`` column of ``df_sub`` and return ``df_sub``.

    ``args.feature`` selects how the sequences are obtained:

    * ``'gene'``: for every gene, the sequences of its ``exon`` rows in
      ``df`` are concatenated in row order and ``GC()`` of the result is
      stored under that gene id.  Each exon is looked up in ``args.fasta``
      by the title ``'<seqname>:<start - 1>-<end>'``.
    * ``'transcript'``: every record of ``args.fasta`` whose id occurs in
      ``df['transcript_id']`` gets ``GC()`` of its sequence stored under
      that id.

    Ids that cannot be resolved are printed instead of aborting the run.

    Raises:
        ValueError: if ``args.feature`` is neither ``'gene'`` nor
            ``'transcript'``.
    """
    if args.feature == 'gene':
        fasta = {title: seq for title, seq in SimpleFastaParser(args.fasta)}

        # Group only the exon rows; this selects rows directly instead of
        # feeding group labels to a positional indexer, and genes without
        # any exon row simply get no entry (reported below).
        exon_rows = df[df['feature'] == 'exon']
        exons = {}
        for gene, rows in exon_rows.groupby('gene_id'):
            parts = []
            for seqname, start, end in zip(rows['seqname'], rows['start'],
                                           rows['end']):
                exon_key = '{}:{}-{}'.format(seqname, start - 1, end)
                seq = fasta.get(exon_key)
                if seq is None:
                    # An absent title used to crash on ``str += None``.
                    print("missing exon sequence in fasta:")
                    print(exon_key)
                    continue
                parts.append(seq)
            if parts:
                exons[gene] = ''.join(parts)

        for gene_id in df_sub.index:
            if gene_id in exons:
                df_sub.at[gene_id, 'gc_content'] = GC(exons[gene_id])
            else:
                print("missing gene_id in exons dict:")
                print(gene_id)
    elif args.feature == 'transcript':
        tx_ids = set(df['transcript_id'].values)
        for rec in FastaIterator(args.fasta):
            if rec.id in tx_ids:
                df_sub.loc[rec.id, 'gc_content'] = GC(rec.seq)
            else:
                # Record present in the FASTA but not in the annotation.
                print(rec.id)
    else:
        raise ValueError('check feature type!')
    return df_sub
예제 #2
0
 def parse(fasta_file):
     """Load every record of ``fasta_file`` into a new ``RefProtFastaFile``.

     The file named by the container's ``filename`` attribute is read record
     by record; each record is converted with
     ``RefProtFastaEntry.parse_fasta_record`` (given the container's
     ``taxon_id``) and added to the container, which is then returned.
     """
     container = RefProtFastaFile(fasta_file)
     with open(container.filename) as handle:
         for fasta_record in FastaIterator(handle):
             container.add_entry(
                 RefProtFastaEntry.parse_fasta_record(fasta_record,
                                                      container.taxon_id))
     return container
예제 #3
0
def align(fh, transl=True,
          muscle_exe=r'C:\Users\matthewwhiteside\workspace\b_ecoli\muscle\muscle3.8.31_i86win32.exe'):
    """
        Translate and align pangenome cluster fasta file

        Args:
            fh: handle of the FASTA file to align.
            transl: when true (default) every record is translated with the
                "Bacterial" table before alignment; when false the
                sequences are aligned as they are.
            muscle_exe: path of the MUSCLE executable; defaults to the
                location that used to be hard-coded here.

        Returns:
            The text MUSCLE wrote to stdout (run with ``clwstrict=True``).
            MUSCLE's stderr is read but discarded and its exit status is
            not checked.
    """

    align_exe = MuscleCommandline(muscle_exe, clwstrict=True)

    # Build the whole input first so that a parsing/translation error does
    # not leave a started child process waiting on its pipes.
    entries = []
    for record in FastaIterator(fh):
        if transl:
            seq = record.translate(table="Bacterial").seq
        else:
            seq = record.seq
        entries.append(">" + record.id + "\n" + str(seq) + "\n")
    fasta_input = "".join(entries)

    # Align on stdin/stdout
    proc = subprocess.Popen(str(align_exe),
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True,
                            shell=False)

    alignment, _stderr = proc.communicate(input=fasta_input)

    return alignment
예제 #4
0
def fasta_reader(filename):
    """Return the records of a FASTA file as a list of ``[id, sequence]``
    pairs, both converted to ``str``, in file order."""
    from Bio.SeqIO.FastaIO import FastaIterator
    with open(filename) as handle:
        return [[str(rec.id), str(rec.seq)] for rec in FastaIterator(handle)]
예제 #5
0
def writeClassifiedFastas(classType, Dirr, resultsDir, df):
    """Split FASTA records into one output file per class.

    The input files are the keys of ``Get_Dirr_All_Fasta(classType, Dirr)``.
    A record is written only when its id is in ``df.index``; its class name
    is ``df[record.id]`` reduced to letters, digits and spaces (trailing
    spaces stripped), and the record goes to
    ``<resultsDir>/<class name>.fasta``.

    NOTE(review): ``df[record.id]`` is a label lookup only if ``df`` is a
    Series indexed by record id - confirm against the caller.

    Args:
        classType, Dirr: forwarded unchanged to ``Get_Dirr_All_Fasta``.
        resultsDir: directory receiving the per-class FASTA files.
        df: id -> class-name mapping (see note above).
    """
    import os  # local import: only needed to build the output paths

    fasta_files_dict = Get_Dirr_All_Fasta(classType, Dirr)
    classDict = {}   # sanitized class name -> open output handle
    writerDict = {}  # sanitized class name -> FastaWriter on that handle
    try:
        for filename in fasta_files_dict:
            with open(filename) as fasta:
                for record in FastaIterator(fasta):
                    if record.id not in df.index:
                        continue
                    raw_name = df[record.id]
                    # Sanitize BEFORE the membership test: testing the raw
                    # name used to reopen (and truncate) the output file for
                    # every record whose name contained a stripped character.
                    classname = "".join(
                        c for c in raw_name
                        if c.isalpha() or c.isdigit() or c == ' ').rstrip()
                    if classname not in writerDict:
                        path = os.path.join(resultsDir, classname + '.fasta')
                        classHandle = open(path, "w")
                        # Register the handle first so it is closed even if
                        # creating the writer fails.
                        classDict[classname] = classHandle
                        myWriter = FastaWriter(classHandle)
                        myWriter.write_header()
                        writerDict[classname] = myWriter
                    writerDict[classname].write_record(record)
        for myWriter in writerDict.values():
            myWriter.write_footer()
    finally:
        for classHandle in classDict.values():
            classHandle.close()
예제 #6
0
def cut_fasta_by_len(fa_file, len_cutoff, outdir, prefix, suffix):
    """Write the records of ``fa_file`` that are at least ``len_cutoff`` long.

    The result is stored in ``<outdir>/<prefix>.ge<len_cutoff><suffix>``;
    ``outdir`` is created when it does not exist.  If that file already
    exists and is non-empty it is treated as a finished earlier result and
    returned immediately without reading ``fa_file``.

    Args:
        fa_file: input FASTA path; read through gzip when it ends in ".gz".
        len_cutoff: minimum record length (``len(rec)``) to keep.
        outdir: output directory.
        prefix, suffix: leading and trailing parts of the output file name.

    Returns:
        Path of the filtered FASTA file.
    """
    # exist_ok avoids the check-then-create race when another worker creates
    # the directory at the same time.
    os.makedirs(outdir, exist_ok=True)

    cut_fa_file = os.path.join(outdir,
                               prefix + ".ge" + str(len_cutoff) + suffix)
    if os.path.exists(cut_fa_file) and os.path.getsize(cut_fa_file) > 0:
        return cut_fa_file

    opener = gzip.open if fa_file.endswith(".gz") else open
    # Both handles are managed by the ``with`` so the input is closed even
    # when parsing or writing fails.
    with opener(fa_file, 'rt') as in_h, open(cut_fa_file, 'w') as out_h:
        # Low-level equivalent of SeqIO.parse() + SeqIO.write().
        writer = FastaWriter(out_h)
        writer.write_header()
        for rec in FastaIterator(in_h):
            if len(rec) >= len_cutoff:
                writer.write_record(rec)
        writer.write_footer()
    return cut_fa_file
def filter_influenza_fa(in_fasta, out_fasta, pattern, accession_set):
    '''
    Append selected records of in_fasta to out_fasta and return how many
    were written.

    in_fasta ....... path of the FASTA file to read
    out_fasta ...... path of the FASTA file to append to (opened 'a+')
    pattern ........ regex applied to each record description; it must
                     provide groups 1-3, group 1 being the accession
    accession_set .. a set of accession IDs that we query
                     the fasta header against

    Records whose description contains '(' are skipped, as are records
    whose accession equals any group captured from the previously
    accepted description.  Only the count is returned; the accessions
    written are collected locally but not returned.

    count = filter_influenza_fa(path_fa, path_out, pattern, accessions)
    '''
    previous_groups = ()
    n_written, kept = 0, []

    with open(in_fasta) as handle, open(out_fasta, 'a+') as out:
        for record in FastaIterator(handle):
            description = record.description
            if '(' in description:
                continue
            groups = re.search(pattern, description).group(1, 2, 3)
            acc = groups[0]
            if acc in previous_groups:
                # Same accession as the last accepted header: skip without
                # updating the remembered groups.
                continue
            previous_groups = groups
            if acc not in accession_set:
                continue
            n_written += 1
            kept.append(acc)
            out.write('>' + acc + '\n')
            out.write(str(record.seq) + '\n')
    return n_written
예제 #8
0
파일: __init__.py 프로젝트: biowdl/biotdg
def generate_fake_genome(sample: str,
                         reference: Path,
                         vcf_path: Path,
                         ploidy_dict: Dict[str, int]
                         ) -> Generator[SeqRecord, None, None]:
    """
    Yield mutated copies of every reference contig, one per allele.

    For each record of the reference FASTA, ``ploidy_dict`` gives the number
    of copies to produce (2 when the contig is not listed).  Copy ``n`` of
    contig ``X`` carries the mutations listed for ``X`` and allele ``n`` by
    ``vcf_to_mutations`` (none when nothing is listed) and is named ``X_n``.
    :param sample: The name in the sample of the VCF to use
    :param reference: The reference fasta file to use
    :param vcf_path: The path to the VCF
    :param ploidy_dict: A dictionary containing the ploidies for each contig.
    :return: A Generator that creates the chromosomes one by one.
    """
    mutations_dict = vcf_to_mutations(str(vcf_path), sample)
    with reference.open("rt") as reference_h:
        for seqrecord in FastaIterator(reference_h):
            contig = seqrecord.id
            # Per-contig lookups do not depend on the allele number.
            contig_mutations = mutations_dict.get(contig, {})
            reference_sequence = str(seqrecord.seq)
            for allele_no in range(ploidy_dict.get(contig, 2)):
                allele_id = contig + "_" + str(allele_no)
                allele_sequence = sequence_with_mutations(
                    sequence=reference_sequence,
                    # Default to empty list if no mutations were listed.
                    mutations=contig_mutations.get(allele_no, []))
                yield SeqRecord(
                    Seq(allele_sequence, seqrecord.seq.alphabet),
                    id=allele_id,
                    name=allele_id,
                    description=allele_id)
예제 #9
0
파일: default.py 프로젝트: jannafierst/SIDR
def readFasta(fastaFile):
    """
    Reads a FASTA file and parses contigs for GC content.

    Args:
        fastaFile: The path to the FASTA file; read through gzip when the
            name ends in ".gz".
    Returns:
        A dictionary mapping contigIDs to sidr.common.Contig objects with GC
        content as a variable.
    Raises:
        ValueError: if two records share the same contigID.
    """
    contigs = []
    if fastaFile.endswith(".gz"):
        # gzip.open defaults to binary mode, but the FASTA parser works on
        # text lines, so text mode has to be requested explicitly.
        data = gzip.open(fastaFile, "rt")
    else:
        data = open(fastaFile)
    with data:
        click.echo("Reading %s" % fastaFile)
        with click.progressbar(FastaIterator(data)) as fi:
            for record in fi:  # TODO: conditional formatting
                contigs.append(
                    common.Contig(record.id.split(' ')[0],
                                  variables={"GC": GC(record.seq)}))
    # A dict would silently keep only the last of two equal ids, so refuse
    # the input instead.
    if len(contigs) != len({x.contigid for x in contigs}):
        raise ValueError("Input FASTA contains duplicate contigIDs, exiting")
    return {x.contigid: x for x in contigs}
예제 #10
0
파일: amptklib.py 프로젝트: irawand07/amptk
def fastarename(input, relabel, output):
    """Copy a FASTA file, renaming the records ``<relabel>1``, ``<relabel>2``, ...

    Args:
        input: path of the FASTA file to read.
        relabel: prefix of the new record names.
        output: path of the FASTA file to write (overwritten); each record
            is written as a header line followed by its sequence on a
            single line.
    """
    from Bio.SeqIO.FastaIO import FastaIterator
    # The input handle is managed too, so it is closed when the loop ends
    # or fails instead of being left to the garbage collector.
    with open(output, 'w') as outfile, open(input) as infile:
        for counter, record in enumerate(FastaIterator(infile), 1):
            newName = relabel + str(counter)
            outfile.write(">%s\n%s\n" % (newName, record.seq))
예제 #11
0
    def _fasta_reader(filename: str) -> Iterator:
        """
        Lazily yield every record of a FASTA file (single- or multi-record).

        The file stays open while the generator is being consumed and is
        closed once it is exhausted or discarded.
        """

        with open(filename) as fasta_handle:
            yield from FastaIterator(fasta_handle)
예제 #12
0
def load_files():
    '''Load the records of every file named in FILES into one list, unshuffled.

    Files are read from the "data/" directory in FILES order; each element
    of the result is a ``(record, file_index)`` tuple, where ``file_index``
    is the position of the source file in FILES.
    '''
    data = []
    for label, filename in enumerate(FILES):
        with open("data/" + filename) as handle:
            data.extend((record, label) for record in FastaIterator(handle))
    return data
예제 #13
0
    def _fasta_reader(filename: str) -> SeqRecord:
        """
        FASTA file reader as iterator.

        This is a generator: calling it returns an iterator that yields the
        records of ``filename`` one at a time, keeping the file open until
        the iteration ends.
        """

        with open(filename) as fasta_handle:
            yield from FastaIterator(fasta_handle)
예제 #14
0
def main():
    """Command-line entry point.

    Parses the arguments, converts ``--source``/``--target`` style values
    with ``Position.from_string``, passes the records of the FASTA file to
    ``mutate`` and prints the result in FASTA format without adding a
    trailing newline.
    """
    args = argument_parser().parse_args()
    source, target = (Position.from_string(text)
                      for text in (args.source, args.target))
    with open(args.fasta, "rt") as fasta_h:
        # mutate() has to consume the records while the file is still open.
        result = mutate(FastaIterator(fasta_h), source, target)
    print(result.format("fasta"), end='')
def read_single_with_titles(filename, alphabet):
    """Parse a FASTA file that must contain exactly one record.

    The file is parsed with ``title_to_ids`` as the title function.  The
    single record is returned; an ``AssertionError`` is raised when a
    second record follows, and ``StopIteration`` when the file has no
    record at all.
    """
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        # The builtin next() works on Python 2.6+ and 3, unlike the
        # Python 2-only ``iterator.next()`` method.
        record = next(iterator)
        try:
            second = next(iterator)
        except StopIteration:
            second = None
    assert record is not None and second is None
    return record
예제 #16
0
def get_base(fasta: str, chromosome: str, start: int, end: Optional[int]):
    """Return the sequence ``[start:end]`` of one record of a FASTA file.

    :param fasta: path of the FASTA file.
    :param chromosome: id of the record to slice.
    :param start: slice start index.
    :param end: slice end index (exclusive); ``None`` selects the single
        position at ``start``.
    :return: the ``seq`` of the sliced record.
    :raises ValueError: if no record has the id ``chromosome``.
    """
    stop = start + 1 if end is None else end

    with open(fasta, "rt") as fasta_handle:
        for record in FastaIterator(fasta_handle):
            if record.id != chromosome:
                continue
            return record[start:stop].seq
    # Every record was inspected without finding the chromosome.
    raise ValueError(f"{chromosome} not found in {fasta}")
예제 #17
0
 def multi_check(self, filename):
     """Test parsing multi-record FASTA files.

     Parses the file once with ``title_to_ids`` as title function and once
     with the default ``SeqIO.parse`` settings, then checks that both give
     the same number of records and that each re-titled record matches
     what ``title_to_ids`` derives from the default description.
     """
     msg = f"Test failure parsing file {filename}"
     re_titled = list(FastaIterator(filename, title2ids=title_to_ids))
     default = list(SeqIO.parse(filename, "fasta"))
     self.assertEqual(len(re_titled), len(default), msg=msg)
     for reference, retitled in zip(default, re_titled):
         expected_id, expected_name, expected_descr = title_to_ids(
             reference.description)
         self.assertEqual(retitled.id, expected_id, msg=msg)
         self.assertEqual(retitled.name, expected_name, msg=msg)
         self.assertEqual(retitled.description, expected_descr, msg=msg)
         self.assertEqual(retitled.seq, reference.seq, msg=msg)
예제 #18
0
def read_single_with_titles(filename, alphabet):
    """Parse a FASTA file that must contain exactly one record.

    The file is parsed with ``title_to_ids`` as the title function.  The
    single record is returned; an ``AssertionError`` is raised when a
    second record follows, and ``StopIteration`` when the file has no
    record at all.
    """
    # ``with`` closes the file even when parsing raises (for example
    # StopIteration on an empty file), which the explicit close() after
    # the parsing did not.
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        record = next(iterator)
        try:
            second = next(iterator)
        except StopIteration:
            second = None
    assert record is not None and second is None
    return record
예제 #19
0
def read_single_with_titles(filename, alphabet):
    """Parser wrapper to confirm single entry FASTA file.

    Returns the only record of ``filename`` (parsed with ``title_to_ids``
    as title function) and asserts that no second record follows.
    """
    with open(filename) as handle:
        records = FastaIterator(handle, alphabet, title_to_ids)
        first = next(records)
        # None stands for "no further record".
        extra = next(records, None)
    assert first is not None and extra is None
    return first
예제 #20
0
def fasta_reader(filename):
    """
    Read a multi or single fasta file, plain or gzip-compressed.

    Inputs:

        filename - string that represents a name of the file or a path to
                   the file; a name ending in '.gz' is read through gzip.

    Outputs:

        A generator yielding one (id, sequence) tuple of strings per
        record.
    """
    if filename.endswith('.gz'):
        handle = gzip.open(filename, 'rt')
    else:
        handle = open(filename)
    with handle:
        for record in FastaIterator(handle):
            yield str(record.id), str(record.seq)
 def multi_check(self, filename, alphabet):
     """Basic test for parsing multi-record FASTA files.

     Parses the file once with ``title_to_ids`` as title function and once
     with the default ``SeqIO.parse`` settings, then checks that both give
     the same number of records and that each re-titled record matches
     what ``title_to_ids`` derives from the default description.
     """
     # Each parse gets its own handle, closed as soon as the records have
     # been read, instead of leaving two open files behind.
     with open(filename) as handle:
         re_titled = list(FastaIterator(handle, alphabet, title_to_ids))
     with open(filename) as handle:
         default = list(SeqIO.parse(handle, "fasta", alphabet))
     self.assertEqual(len(re_titled), len(default))
     for old, new in zip(default, re_titled):
         idn, name, descr = title_to_ids(old.description)
         self.assertEqual(new.id, idn)
         self.assertEqual(new.name, name)
         self.assertEqual(new.description, descr)
         self.assertEqual(str(new.seq), str(old.seq))
         self.assertEqual(new.seq.alphabet, old.seq.alphabet)
예제 #22
0
def read_fasta(inputfile):
	"""Load the records of a FASTA formatted file into two parallel lists.

	:param inputfile: .fasta file with sequences and headers to read
	:return: ``(sequences, names)``: the sequences as strings and the full
		record descriptions, both in file order.
	"""
	with open(inputfile) as handle:
		records = list(FastaIterator(handle))
	names = [record.description for record in records]
	sequences = [str(record.seq) for record in records]
	return sequences, names
예제 #23
0
 def multi_check(self, filename, alphabet):
     """Test parsing multi-record FASTA files.

     Parses the file once with ``title_to_ids`` as title function and once
     with the default ``SeqIO.parse`` settings, then checks that both give
     the same number of records and that each re-titled record matches
     what ``title_to_ids`` derives from the default description.
     """
     msg = "Test failure parsing file %s" % filename
     re_titled = list(FastaIterator(filename, alphabet, title_to_ids))
     default = list(SeqIO.parse(filename, "fasta", alphabet))
     self.assertEqual(len(re_titled), len(default), msg=msg)
     for reference, retitled in zip(default, re_titled):
         expected_id, expected_name, expected_descr = title_to_ids(
             reference.description)
         self.assertEqual(retitled.id, expected_id, msg=msg)
         self.assertEqual(retitled.name, expected_name, msg=msg)
         self.assertEqual(retitled.description, expected_descr, msg=msg)
         self.assertEqual(str(retitled.seq), str(reference.seq), msg=msg)
         self.assertEqual(retitled.seq.alphabet, reference.seq.alphabet,
                          msg=msg)
예제 #24
0
 def parse_file(file_path):
     """Count the sequence symbols of every record in a FASTA file.

     Returns a dict mapping each record id to a dict of counters: one per
     IUPAC nucleotide code in upper and in lower case (always present, 0
     when unseen), ``'all'`` (sequence length), ``'all_small'`` (symbols
     for which ``str.islower()`` is true) and ``'all_big'`` (all other
     symbols).  A symbol outside the predefined table gets its own counter
     instead of raising ``KeyError``.
     """
     symbols = 'ACGTYMSRWKNDBHV'
     records_letters = {}
     with open(file_path) as in_handle:
         for record in FastaIterator(in_handle):
             # Same keys, in the same order, as the former literal table.
             counts = dict.fromkeys(symbols, 0)
             counts['all'] = 0
             counts.update(dict.fromkeys(symbols.lower(), 0))
             counts['all_small'] = 0
             counts['all_big'] = 0
             for letter in str(record.seq):
                 if letter.islower():
                     counts['all_small'] += 1
                 else:
                     counts['all_big'] += 1
                 counts[letter] = counts.get(letter, 0) + 1
                 counts['all'] += 1
             records_letters[record.id] = counts
     return records_letters
예제 #25
0
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    """Copy a FASTA file while rewriting the record headers.

    Args:
        fa_in: input FASTA path.
        fa_out: output FASTA path.
        header_function: passed to ``FastaIterator`` as ``title2ids``, so
            it decides the id/name/description of every record.
        in_gz: read ``fa_in`` through gzip when true.
        gz: write ``fa_out`` through ``bgzf.BgzfWriter`` when true,
            otherwise as plain text.
    """
    in_h = gzip.open(fa_in, 'rt') if in_gz else open(fa_in, 'r')
    # try/finally guarantees both handles are closed (output first, as
    # before) even when opening the output, parsing or writing fails.
    try:
        out_h = bgzf.BgzfWriter(fa_out, 'wb') if gz else open(fa_out, 'w')
        try:
            writer = FastaWriter(out_h)
            writer.write_header()
            for rec in FastaIterator(in_h, title2ids=header_function):
                writer.write_record(rec)
            writer.write_footer()
        finally:
            out_h.close()
    finally:
        in_h.close()
예제 #26
0
파일: random.py 프로젝트: kako-f/Generator
    def create_rs(self, file):
        """Write a randomised counterpart of every record in a FASTA file.

        For each record of ``file`` the sequence is passed to
        ``self.generate_rs`` and the result is written, in FASTA format,
        with ``'_random_'`` appended to the record id, name and
        description.  The output is named ``<stem>_random<extension>`` and
        placed in the folder returned by ``Cf().create_file_folder``.

        Args:
            file: path of the FASTA file to read.

        Returns:
            Path of the generated FASTA file.
        """
        newpath = Cf().create_file_folder(file=file)
        filename, extension = os.path.splitext(os.path.basename(file))
        random_genome_file = os.path.join(newpath,
                                          filename + '_random' + extension)
        # Plain 'r': the 'U' flag was removed in Python 3.11 and universal
        # newlines are the text-mode default on Python 3 anyway.
        with open(file, 'r') as GenomeFile, \
                open(random_genome_file, 'w') as RgFile:
            for record in FastaIterator(handle=GenomeFile):
                print('Creating random record for: ' + record.id)
                created_random_seq = self.generate_rs(str(record.seq))
                random_record = SeqRecord(BioPythonSeq(created_random_seq),
                                          id=record.id + '_random_',
                                          name=record.name + '_random_',
                                          description=record.description + '_random_')
                SeqIO.write(random_record, RgFile, 'fasta')

        return random_genome_file
예제 #27
0
    def parse_file(self, file_path):
        """Analyse every record of a FASTA file in consecutive windows.

        Each sequence is cut into non-overlapping windows of
        ``self.window_size`` symbols (the last window holds the remainder)
        and every window is passed to ``self.parse_sequence``.

        Args:
            file_path: path of the FASTA file.

        Returns:
            ``{record id: {window start index: parse_sequence result}}``.

        Raises:
            ValueError: if ``self.window_size`` is not positive (the
                window loop would never terminate).
        """
        window = self.window_size
        if window <= 0:
            raise ValueError(
                "window_size must be positive, got %r" % (window,))

        data = {}
        print("Analysing: " + file_path)
        with open(file_path) as handle:
            for record in FastaIterator(handle):
                sequence = record.seq
                # Slice ends are exclusive, so the end is the full length;
                # using ``len - 1`` dropped the last symbol of each record.
                seq_len = len(sequence)
                windows = {}
                start_index = 0

                while start_index + window < seq_len:
                    windows[start_index] = self.parse_sequence(
                        sequence[start_index:start_index + window])
                    start_index += window

                # Final (possibly shorter) window up to the sequence end.
                windows[start_index] = self.parse_sequence(
                    sequence[start_index:seq_len])
                data[record.id] = windows
        return data
예제 #28
0
 def simple_check(self, filename):
     """Test parsing single record FASTA files.

     The record is compared with the output of the crude
     ``read_title_and_seq`` parser, first via ``FastaIterator`` with
     ``title_to_ids`` as title function, then via ``SeqIO.read`` with its
     default settings.
     """
     msg = f"Test failure parsing file {filename}"
     title, seq = read_title_and_seq(filename)  # crude parser
     idn, name, descr = title_to_ids(title)

     # First check using Bio.SeqIO.FastaIO directly with title function.
     records = FastaIterator(filename, title2ids=title_to_ids)
     record = next(records)
     with self.assertRaises(StopIteration):
         next(records)
     for attribute, expected in (("id", idn),
                                 ("name", name),
                                 ("description", descr),
                                 ("seq", seq)):
         self.assertEqual(getattr(record, attribute), expected, msg=msg)

     # Now check using Bio.SeqIO (default settings)
     record = SeqIO.read(filename, "fasta")
     first_word = title.split()[0]
     for attribute, expected in (("id", first_word),
                                 ("name", first_word),
                                 ("description", title),
                                 ("seq", seq)):
         self.assertEqual(getattr(record, attribute), expected, msg=msg)
예제 #29
0
def FindGene(PATRICID, Header, base_dir='/pylon5/br5phhp/tv349/AMR/PATRIC'):
    """Collect the annotation and protein sequence of one PATRIC feature.

    Args:
        PATRICID: genome identifier used in the PATRIC file names.
        Header: ``patric_id`` of the feature, which is also the record id
            in the protein FASTA file.
        base_dir: directory holding the ``SPGENE``, ``FEATURES`` and
            ``PROTEIN`` sub-directories; defaults to the location that
            used to be hard-coded here.

    Returns:
        A dict with the ``gene``/``product`` columns of the features table,
        overridden and extended by the ``gene``/``product``/``property``/
        ``function`` columns of the spgene table, plus ``'translation'``
        (the protein sequence).  A table contributes nothing unless exactly
        one of its rows has ``patric_id == Header``.

    Raises:
        LookupError: if the protein FASTA has no record with id ``Header``
            (formerly an ``UnboundLocalError``).
    """

    def _single_row(table, columns):
        # The selected columns of the only row matching Header, else {}.
        hits = table.index[table['patric_id'] == Header].tolist()
        if len(hits) != 1:
            return {}
        return table.loc[hits, columns].to_dict('records')[0]

    SPGENE = pd.read_csv(base_dir + '/SPGENE/' + PATRICID +
                         '.PATRIC.spgene.tab',
                         sep='\t')
    OUTSPGENE = _single_row(SPGENE,
                            ['gene', 'product', 'property', 'function'])

    FEATURES = pd.read_csv(base_dir + '/FEATURES/' + PATRICID +
                           '.PATRIC.features.tab',
                           sep='\t')
    OUTFEATURES = _single_row(FEATURES, ['gene', 'product'])

    # spgene values take precedence over the features table.
    OUT = {**OUTFEATURES, **OUTSPGENE}

    # Get sequence
    AAseq = None
    protein_path = base_dir + '/PROTEIN/' + PATRICID + '.PATRIC.faa'
    with open(protein_path) as handle:
        for record in FastaIterator(handle):
            # No early exit: if the id occurs more than once the last
            # record wins, as before.
            if record.id == Header:
                AAseq = str(record.seq)

    if AAseq is None:
        raise LookupError('%s not found in %s' % (Header, protein_path))
    OUT['translation'] = AAseq

    return OUT
예제 #30
0
FNULL = open(os.devnull, 'w')
pid = os.getpid()
#reverse complement rev primer
ForPrimer = args.fwdprimer
RevPrimer = revcomp_lib.RevComp(args.revprimer)

print 'Loading ' + '{0:,}'.format(amptklib.countfasta(
    args.input)) + ' sequence records'
print 'Searching for forward primer: %s, and reverse primer: %s' % (ForPrimer,
                                                                    RevPrimer)
print 'Requiring reverse primer match with at least %i mismatches' % args.primer_mismatch
#loop through seqs, remove primer if found, and truncate to length
truncated = 'bold2amptk_' + str(pid) + '.truncate.tmp'
with open(truncated, 'w') as output:
    for record in FastaIterator(open(args.input)):
        Seq = str(record.seq)
        StripSeq = ''
        ForCutPos = amptklib.findFwdPrimer(ForPrimer, Seq,
                                           args.primer_mismatch,
                                           amptklib.degenNucSimple)
        RevCutPos = amptklib.findRevPrimer(RevPrimer, Seq,
                                           args.primer_mismatch,
                                           amptklib.degenNucSimple)
        if ForCutPos and RevCutPos:
            StripSeq = Seq[ForCutPos:RevCutPos]
        elif not ForCutPos and RevCutPos:
            StripSeq = Seq[:RevCutPos]
        if len(StripSeq) >= args.minlen:
            output.write('>%s\n%s\n' % (record.description, StripSeq))