예제 #1
0
def load_ids(filename, filetype):
    """Yield record identifiers from *filename*.

    Parameters
    ----------
    filename : str
        Path to the input file.
    filetype : str
        One of "tabular", "fasta", any "fastq*" variant, or "sff".

    Yields
    ------
    str
        One identifier per record. Calls ``sys.exit`` on an unknown
        *filetype* or if Biopython is missing for SFF input.
    """
    if filetype == "tabular":
        # Identifier is the first tab-separated column of each
        # non-blank, non-comment line.
        with open(filename) as handle:
            for line in handle:
                line = line.rstrip("\n")
                if line and not line.startswith("#"):
                    yield line.split("\t", 1)[0]
    elif filetype == "fasta":
        # Identifier is the first whitespace-delimited word after ">".
        with open(filename) as handle:
            for line in handle:
                if line.startswith(">"):
                    yield line[1:].rstrip("\n").split(None, 1)[0]
    elif filetype.startswith("fastq"):
        # Use the Galaxy library not Biopython to cope with CS
        from galaxy_utils.sequence.fastq import fastqReader
        # Text mode with default universal newlines; the old "rU" mode
        # was removed in Python 3.11. The with-block guarantees the
        # handle closes even if the generator is abandoned early.
        with open(filename) as handle:
            for record in fastqReader(handle):
                # The [1:] is because the fastqReader leaves the @ on
                # the identifier.
                yield record.identifier.split()[0][1:]
    elif filetype == "sff":
        try:
            from Bio.SeqIO import index
        except ImportError:
            sys.exit("Require Biopython 1.54 or later (to read SFF files)")
        # This will read the SFF index block if present (very fast)
        for name in index(filename, "sff"):
            yield name
    else:
        sys.exit("Unexpected file type %s" % filetype)
예제 #2
0
def load_ids(filename, filetype):
    """Yield record identifiers from *filename*.

    Parameters
    ----------
    filename : str
        Path to the input file.
    filetype : str
        One of "tabular", "fasta", any "fastq*" variant, or "sff".

    Yields
    ------
    str
        One identifier per record. Calls ``sys.exit`` on an unknown
        *filetype* or if Biopython is missing for SFF input.
    """
    if filetype == "tabular":
        # Identifier is the first tab-separated column of each
        # non-blank, non-comment line.
        with open(filename) as handle:
            for line in handle:
                line = line.rstrip("\n")
                if line and not line.startswith("#"):
                    yield line.split("\t", 1)[0]
    elif filetype == "fasta":
        # Identifier is the first whitespace-delimited word after ">".
        with open(filename) as handle:
            for line in handle:
                if line.startswith(">"):
                    yield line[1:].rstrip("\n").split(None, 1)[0]
    elif filetype.startswith("fastq"):
        # Use the Galaxy library not Biopython to cope with CS
        from galaxy_utils.sequence.fastq import fastqReader
        # Text mode with default universal newlines; the old "rU" mode
        # was removed in Python 3.11. The with-block guarantees the
        # handle closes even if the generator is abandoned early.
        with open(filename) as handle:
            for record in fastqReader(handle):
                # The [1:] is because the fastqReader leaves the @ on
                # the identifier.
                yield record.identifier.split()[0][1:]
    elif filetype == "sff":
        try:
            from Bio.SeqIO import index
        except ImportError:
            sys.exit("Require Biopython 1.54 or later (to read SFF files)")
        # This will read the SFF index block if present (very fast)
        for name in index(filename, "sff"):
            yield name
    else:
        sys.exit("Unexpected file type %s" % filetype)
예제 #3
0
def main():
    """Write FASTA records to stdout in the tree's ladderized leaf order.

    argv[1] is a newick tree file; argv[2] is a FASTA file whose record
    ids match the tree's leaf names.
    """
    tree_path, fasta_path = sys.argv[1], sys.argv[2]
    tree = read(tree_path, 'newick')
    seqs = index(fasta_path, 'fasta')
    # Root unrooted trees at the midpoint before ladderizing
    # (presumably for a deterministic leaf ordering — confirm).
    if not tree.rooted:
        tree.root_at_midpoint()
    tree.ladderize(reverse=True)
    for tip in tree.get_terminals():
        write(seqs[tip.name], sys.stdout, 'fasta')
예제 #4
0
hmmResultsDir = "hmm/results"
# crisprFiles = load(open("pickles/CRISPRs.p","rb"))
# casOperons = CasOperons(gene)
# casOperons.hasCas9(hmmResultsDir,crisprFiles)
# Load the previously pickled operon collection; the with-block closes
# the handle promptly instead of leaking it.
with open("pickles/Cas9_Operons_HMM.p", 'rb') as pkl:
    casOperons = load(pkl)

#Get unique chrs and the proteins they are associated with
allCasAsmFile = "assemblies/All_%s_Unique_Assemblies.fasta" % (gene)
allCasAAsFile = "proteins/All_%s-Like.faa" % (gene)
casOperons.uniqueNukeSeqs(allCasAsmFile,
                          allCasAAsFile)  # Calls dump when it finishes

# Launch the domain search for the faa file created above
system(
    "sbatch /mnt/research/germs/shane/transActRNA/scripts/hpc/DomainSearch.sb")
casAAs = dict(index(allCasAAsFile, "fasta"))
# Protein ids known to the operon map but absent from the written .faa.
unUsed = set(casOperons.seqMap.protToAsm).difference(casAAs)
deletedOperons = {}
# NOTE(review): pres and hasSeq are never updated in this section —
# verify they are used further down before removing.
pres, absnt, hasSeq = 0, 0, 0
for protID in unUsed:
    try:
        operon = casOperons.operons[casOperons.seqMap[protID]]
        deletedOperons[protID] = operon
    except KeyError:
        # Was a bare "except:"; only a missing mapping key is expected
        # here, so catch that exception specifically.
        absnt += 1

from pickle import dump
# Persist the operons that could still be resolved; close the file
# handle deterministically.
with open("/mnt/research/germs/shane/transActRNA/data/pickles/DeletedOperons.p",
          "wb") as pkl:
    dump(deletedOperons, pkl)
예제 #5
0
        return right_dist
    else:
        return -left_dist


if __name__ == "__main__":
    data = pd.read_table(sys.argv[1],
                         sep='\s+',
                         comment='#',
                         names=[
                             'feature_id', '_1', '_2', '_3', 'position', '_5',
                             '_6', '_7', 'score', '_9', '_10', '_11'
                         ],
                         index_col='feature_id',
                         usecols=['feature_id', 'position', 'score'])
    seqindex = index(sys.argv[2], 'fasta')

    for feature_id, (_position, score) in data.iterrows():
        position = int(_position)
        assert position == _position
        position -= 1
        seq = seqindex[feature_id].seq
        c_pos = relative_pos_closest(str(seq), position, 'C')
        if c_pos is None:
            # Print an empty string if there is no 'C'.
            c_pos = ''

        print(
            feature_id,
            position,
            round(score, 3),
예제 #6
0
                   type=int,
                   dest='min_length',
                   default=1000,
                   help="Minimum contig length to output [%(default)s]")
    p.add_argument('seq_path',
                   type=str,
                   metavar="FASTA",
                   help="Sequences to be trimmed.")
    p.add_argument(
        'corr_handle',
        type=argparse.FileType('r'),
        metavar="CORR",
        help="Correlation table from calculate_per_position_stats.py")
    args = p.parse_args()

    seqs = index(args.seq_path, 'fasta')
    data = pd.read_table(
        args.corr_handle,
        names=['contig_id', 'position', 'total_depth', 'cosine_similarity'])
    data.contig_id = data.contig_id.astype(str)  # FIXME: Brittle
    data.position = data.position - 1  # Convert to zero-indexed.
    tally_seqs = 0
    tally_nucs = 0
    for contig_id in tqdm(list(seqs.keys())):
        seq = seqs[contig_id].seq
        if len(seq) < args.min_length:
            # print(f"Contig {contig_id} too short.", file=sys.stderr)
            continue
        d = data[data.contig_id == contig_id]
        if d.empty:
            print("\rWARNING: {} not found in corr data.".format(contig_id),
예제 #7
0
#!/usr/bin/env python3

from Bio.SeqIO import index
from copy import deepcopy
import sys

if __name__ == "__main__":
    # Extract (sub)sequences from an indexed FASTA file.
    # argv[1] is the FASTA path; each remaining argument is either
    # "seq_id" (whole record) or "seq_id:left-right" (a slice).
    # Specifying left > right yields the reverse complement of the span.
    seq_index = index(sys.argv[1], 'fasta')
    for spec in sys.argv[2:]:
        seq_id, *coords = spec.rsplit(':', 1)
        # Deep-copy so mutating .seq below never touches the index.
        record = deepcopy(seq_index[seq_id])
        if coords:
            start, stop = map(int, coords[0].split('-'))
        else:
            start, stop = 0, len(record)
        if start > stop:
            record.seq = record.seq[stop:start].reverse_complement()
        else:
            record.seq = record.seq[start:stop]
        print(f'>{record.id}\n{record.seq}')
예제 #8
0
# Precompute the per-codon position counts once, at import time.
POSITIONS.update({codon: _count_positions(codon) for codon in TRANSLATION})


def count_positions(codon):
    """Look up *codon* in the precomputed module-level POSITIONS table."""
    counts = POSITIONS[codon]
    return counts


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument('align1', metavar='FASTA1')
    p.add_argument('align2', metavar='FASTA2', nargs='?')
    args = p.parse_args()

    if args.align2:
        rec_index1 = index(args.align1, 'fasta')
        rec_index2 = index(args.align2, 'fasta')
        comparisons = []
        for idA, idB in zip(rec_index1.keys(), rec_index2.keys()):
            comparisons.append((idA, rec_index1[idA], idB, rec_index2[idB]))
    else:
        rec_index = index(args.align1, 'fasta')
        comparisons = []
        ids = list(rec_index.keys())
        for i, idA in enumerate(ids):
            for j, idB in enumerate(ids[i + 1:]):
                comparisons.append((idA, rec_index[idA], idB, rec_index[idB]))

    for idA, recA, idB, recB in comparisons:
        seqA = recA.seq
        seqB = recB.seq
예제 #9
0
gene = "Cas9"
assemblyDir = "assemblies/assemblies_W_%s/" % (gene)
cas9Assemblies = listdir(assemblyDir)
goodDomIDS = load(open("pickles/%s_GoodDomainIDS.p" % (gene), "rb"))
goodDomMap = load(open("pickles/%s_GoodDomMap.p" % (gene), "rb"))
hmm_parser = load(open("pickles/%s_HMM_Parsing_Results.p" % (gene), "rb"))

print("All loaded")
#Copy unique nucleotide sequence
from Bio.SeqIO import index
nukSeqHash, protSeqHash = set(), set()
alreadyGotIt, count = 0, 0
for assembly in cas9Assemblies:
    baseID = assembly[:-6]
    allAssemblySeqs = index(assemblyDir + assembly, "fasta")
    overlap = goodDomIDS.intersection(allAssemblySeqs.keys())
    for recID in overlap:
        seq = str(allAssemblySeqs[recID].seq).upper()
        if seq in nukSeqHash and len(goodDomMap[recID]) == 1:
            alreadyGotIt += 1
            continue
        nukSeqHash.add(seq)
        #There may be more than 1 protein on the pseudochromosome, save both as separate files
        if len(goodDomMap[recID]) > 1:
            print("%i Cas9s on %s %s" %
                  (len(goodDomMap[recID]), recID, baseID))
        for orfID in goodDomMap[recID]:
            #                 protSeq = str(hmm_parser.results[baseID].proteins[orfID].seq).upper()
            with open("assemblies/pseudoChromos/%s.fasta" % (orfID),
                      "w") as fh: