Exemple #1
 def __init__(self, fwd, rev, parent=None):
     """Wrap the forward and the reverse path in FASTA objects and keep
     a reference to the optional parent. The `gzipped` flag is taken
     from the forward file only."""
     # The two files of the pair #
     self.fwd = FASTA(fwd)
     self.rev = FASTA(rev)
     # Additional information #
     self.parent = parent
     self.gzipped = self.fwd.gzipped
Exemple #2
 def __init__(self, fwd, rev, parent=None):
     """Wrap the forward and the reverse path in FASTA objects and keep
     a reference to the optional parent. The `gzipped` flag is taken
     from the forward file only."""
     # The two files of the pair #
     self.fwd = FASTA(fwd)
     self.rev = FASTA(rev)
     # Additional information #
     self.parent = parent
     self.gzipped = self.fwd.gzipped
Exemple #3
 def __init__(self,
     query_path,  # The input sequences
     db_path=pfam.hmm_db,  # The database to search
     seq_type='prot',  # The seq type of the query_path file: 'prot' or 'nucl'
     e_value=0.001,  # The search threshold
     params=None,  # Add extra params for the command line
     out_path=None,  # Where the results will be dropped
     executable=None,  # If you want a specific binary give the path
     cpus=None):  # The number of threads to use
     """Record the settings of an HMM search.

     `db_path` may also be one of the short names 'pfam' or 'tigrfam'.
     `out_path` may be None (results go next to the query with the
     '.hmmout' extension), a directory ending in '/' (the query prefix
     plus '.hmmout' is appended), or a complete file path.
     When `cpus` is None, all cores are used, capped at 32."""
     # Save attributes #
     self.query = FASTA(query_path)
     self.db = FilePath(db_path)
     self.params = params if params else {}
     self.e_value = e_value
     self.seq_type = seq_type
     self.executable = FilePath(executable)
     # Cores to use #
     if cpus is None: self.cpus = min(multiprocessing.cpu_count(), 32)
     else: self.cpus = cpus
     # Auto detect database short name #
     if db_path == 'pfam': self.db = pfam.hmm_db
     if db_path == 'tigrfam': self.db = tigrfam.hmm_db
     # Output #
     if out_path is None:
         self.out_path = FilePath(self.query.prefix_path + '.hmmout')
     elif out_path.endswith('/'):
         self.out_path = FilePath(out_path + self.query.prefix + '.hmmout')
     else:
         # A complete file path was given: use it unchanged #
         self.out_path = FilePath(out_path)
Exemple #4
def generate_values(path, progress=False):
    """Yield one `(id, description, sequence)` tuple for every record of
    the FASTA file at `path`. When `progress` is true the records are
    wrapped in a tqdm progress bar."""
    records = SeqIO.parse(path, 'fasta')
    # The bar needs a total, which is obtained from len(FASTA(path)) #
    if progress: records = tqdm(GenWithLength(records, len(FASTA(path))))
    for record in records:
        yield (record.id, record.description, str(record.seq))
Exemple #5
 def load(self):
     """Deferred second stage of initialisation, run only when needed.
     Loads the pools and samples, then builds the helper objects that
     hang off this cluster. Returns self so that calls can be chained."""
     # Every pool and every sample has to be loaded first #
     for pool in self.pools: pool.load()
     for sample in self.samples: sample.load()
     # Automatic paths #
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Do our samples match exactly those of the first sample's project ? #
     candidate = self.first.pool.project
     same_samples = set(self.samples) == set(candidate.samples)
     self.project = candidate if same_samples else None
     # The object that runs things #
     self.runner = ClusterRunner(self)
     # All the reads in one FASTA #
     self.reads = FASTA(self.p.all_reads_fasta)
     # The three OTU picking strategies #
     self.otu_uparse = UparseOTUs(self)
     self.otu_uclust = UclustOTUs(self)
     self.otu_cdhit  = CdhitOTUs(self)
     # The one used by default #
     self.otus = self.otu_uparse
     # Reporting, simple and full #
     self.reporter = ClusterReporter(self)
     self.report = ClusterReport(self)
     # Mark as done #
     self.loaded = True
     return self
Exemple #6
 def load(self):
     """Deferred second stage of initialisation, run only when needed.
     Raises an Exception when one of the SFF files is absent from disk.
     Returns self so that calls can be chained."""
     # Every input file must be present #
     for info in self.sff_files_info:
         path = info['path']
         if not os.path.exists(path): raise Exception("No file at %s" % path)
     # Output directory and automatic paths #
     self.base_dir = self.out_dir + self.id_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Link the json inside the output directory #
     self.p.info_json.link_from(self.json_path, safe=True)
     # Compile the primer pattern once #
     self.primer_regex = re.compile(self.info['primer'])
     # The raw files #
     self.raw_fasta = FASTA(self.p.raw_fasta)
     self.raw_fastq = FASTQ(self.p.raw_fastq)
     # The standard FASTA files #
     self.reads = FASTA(self.p.reads_fasta)
     self.fasta = FASTA(self.p.renamed)
     # The special FASTQ file #
     self.fastq = FASTQ(self.p.reads_fastq)
     # A shameless hack for cdhit to work #
     self.renamed = self.fastq
     # Pre-denoised special case, currently switched off by `and False` #
     predenoised = self.info['predenoised']
     if predenoised and False:
         self.sff_files_info = []
         self.reads.link_from(predenoised, safe=True)
     # Special submission attributes #
     self.sra = PyroSampleSRA(self)
     # Mark as done #
     self.loaded = True
     return self
Exemple #7
    def read_file(self, fp):
        """
        Read the first FASTA record from the content of fp,
        and set the chromosome name and sequence using set_chromosome method.

        Raises NoChromosomeFoundError when the record has neither a name
        nor a sequence, and ChromosomeFASTAFromatError when exactly one
        of the two is missing.
        """
        if self.verbose:
            # Same output as the Python 2 `print >> stderr`, valid on 2 and 3
            stderr.write("reading a FASTA record to set a chromosome\n")
        fasta = FASTA(fp=fp, verbose=self.verbose)
        chr_name, chr_seq = fasta.get_record()

        if chr_name and chr_seq:
            # Drop the first character of the name
            # NOTE(review): presumably the leading '>' of the header -- confirm
            chr_name = chr_name[1:]
            self.set_chromosome(chr_name, chr_seq)
        elif not chr_name and not chr_seq:
            raise NoChromosomeFoundError(fp.name, chr_name, chr_seq)
        else:
            # Only one of name / sequence is present: malformed record
            raise ChromosomeFASTAFromatError(fp.name, chr_name, chr_seq)
Exemple #8
    def read_file(self, fp):
        """
        Read the first FASTA record from the content of fp,
        and set the chromosome name and sequence using set_chromosome method.

        Raises NoChromosomeFoundError when the record has neither a name
        nor a sequence, and ChromosomeFASTAFromatError when exactly one
        of the two is missing.
        """
        if self.verbose:
            # Same output as the Python 2 `print >> stderr`, valid on 2 and 3
            stderr.write("reading a FASTA record to set a chromosome\n")
        fasta = FASTA(fp=fp, verbose=self.verbose)
        chr_name, chr_seq = fasta.get_record()

        if chr_name and chr_seq:
            # Drop the first character of the name
            # NOTE(review): presumably the leading '>' of the header -- confirm
            chr_name = chr_name[1:]
            self.set_chromosome(chr_name, chr_seq)
        elif not chr_name and not chr_seq:
            raise NoChromosomeFoundError(fp.name, chr_name, chr_seq)
        else:
            # Only one of name / sequence is present: malformed record
            raise ChromosomeFASTAFromatError(fp.name, chr_name, chr_seq)
Exemple #9
 def fasta(self):
     """The FASTA file holding the filtered genes of this cluster,
     filled with one entry per gene the first time the file is empty.
     Every entry is named after `gene.name`."""
     fasta = FASTA(self.p.fasta)
     # Already filled: nothing to do #
     if fasta: return fasta
     # Otherwise add every filtered gene #
     for gene in self.filtered_genes: fasta.add_str(str(gene), name=gene.name)
     return fasta
Exemple #10
def main():
    """Print one encoded line per PDB id that has dihedral angles.

    With `args.label == 'x'` the one-hot encoded sequence is printed, with
    `'y'` the dihedral labels are. The sequence is the peptide alone when
    `args.pep` is set, and the universal groove followed by the peptide
    otherwise. Every PDB id that produced a line is also written to the
    file named by `args.pdbids`.
    """
    args = parse_args()
    dihedrals = read_dihedrals()

    fasta = FASTA(args.fasta)
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    # Not used below; the calls are kept because they still read their inputs
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    labels = read_rmsd_file(args.rms)

    pdbids = read_datafile(args.t)

    # The context manager guarantees that the id list is flushed and closed
    with open(args.pdbids, 'w') as outputfilehandler:
        for pdbid in pdbids:
            if pdbid not in dihedrals:
                continue
            if args.pep:
                sequence = peptides[pdbid]
            else:
                # NOTE(review): the original expression was cut after the
                # '+'; groove + peptide mirrors the sibling scripts -- confirm
                sequence = universalGrooves[pdbid] + peptides[pdbid]
            finalSeqCode = oneHotEncoding(sequence)
            finalLabelCode = dihedrals[pdbid]
            if args.label == 'x':
                print(', '.join(finalSeqCode))
                outputfilehandler.write(pdbid + '\n')
            elif args.label == 'y':
                print(', '.join(finalLabelCode))
                outputfilehandler.write(pdbid + '\n')

Exemple #11
class Silva(Database):
    """SILVA provides comprehensive, quality checked and regularly updated datasets of aligned small (16S/18S, SSU) and large subunit (23S/28S, LSU) ribosomal RNA (rRNA) sequences for all three domains of life (Bacteria, Archaea and Eukarya). SILVA are the official databases of the software package ARB.


    To install:
        from seqsearch.databases.silva import silva

    It will put it in ~/databases/silva_xxx/

    view_url   = "https://www.arb-silva.de/no_cache/download/archive/"
    base_url   = "https://www.arb-silva.de/fileadmin/silva_databases/"
    short_name = "silva"

    all_paths = """

    def __init__(self, version, seq_type, base_dir=None):
        """Record the SILVA release and derive every path and file name
        from it. When `base_dir` is None the `home` directory is used."""
        # Attributes #
        self.version    = version
        self.seq_type   = seq_type
        self.short_name = self.short_name + "_" + self.version
        # Where the database lives on disk #
        root = home if base_dir is None else base_dir
        self.base_dir = root + 'databases/' + self.short_name + '/'
        self.p        = AutoPaths(self.base_dir, self.all_paths)
        # Release specific part of the URL #
        self.url = "release_%s/Exports/" % self.version
        # The database: gzipped download and its unzipped counterpart #
        nr99_gz = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
        self.nr99_name = nr99_gz
        self.nr99_dest = FASTA(self.base_dir + nr99_gz)
        self.nr99      = FASTA(self.base_dir + nr99_gz[:-3])
        # The alignment: gzipped download and its unzipped counterpart #
        aligned_gz = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
        self.aligned_name = aligned_gz
        self.aligned_dest = FASTA(self.base_dir + aligned_gz)
        self.aligned      = FASTA(self.base_dir + aligned_gz[:-3])

    def download(self):
        """Fetch the gzipped database and the gzipped alignment with wget,
        announcing each URL on stdout before its download starts."""
        targets = ((self.nr99_name,    self.nr99_dest),
                   (self.aligned_name, self.aligned_dest))
        for name, dest in targets:
            url = self.base_url + self.url + name
            # One pre-joined string prints the same text whether `print`
            # is the Python 2 statement or the Python 3 function
            print("\nDownloading " + url)
            wget.download(url, out=dest.path)

    def unzip(self):
 def to_fasta(self, path, verbose=False):
     """Write every record of this object to `path` in FASTA format and
     return a FASTA object on that path. A tqdm progress bar is shown
     when `verbose` is true."""
     import tqdm
     with open(path, 'w') as handle:
         # Only wrap ourselves in a progress bar when asked to #
         records = tqdm.tqdm(self) if verbose else self
         for record in records: SeqIO.write(record, handle, 'fasta')
     return FASTA(path)
Exemple #13
 def fresh_fasta(self):
     """The FASTA file with the genes of all fresh water genomes. It is
     built with `gunzip -c` the first time it is found missing."""
     fasta = FASTA(self.p.fresh_fasta)
     # Nothing to build when the file is already there #
     if fasta.exists: return fasta
     print("Building fasta file with all fresh genes...")
     fresh = [genome for genome in genomes.values() if genome.fresh]
     command = 'gunzip -c %s > %s' % (' '.join(fresh), fasta)
     shell_output(command)
     # The result must hold as many sequences as its sources together #
     assert len(fasta) == sum(map(len, fresh))
     return fasta
Exemple #14
 def fasta(self):
     """The FASTA file with every sequence of `pfam.fasta` whose
     description contains the name of this family. It is built the
     first time it is found missing."""
     fasta = FASTA(self.p.proteins)
     # Nothing to build when the file is already there #
     if fasta.exists: return fasta
     for seq in pfam.fasta:
         if self.fam_name not in seq.description: continue
         fasta.add_seq(seq)
     # At least one sequence must have matched #
     assert fasta
     return fasta
Exemple #15
 def __init__(self, version, seq_type, base_dir=None):
     """Record the SILVA release and derive every path and file name
     from it. When `base_dir` is None the `home` directory is used."""
     # Attributes #
     self.version    = version
     self.seq_type   = seq_type
     self.short_name = self.short_name + "_" + self.version
     # Where the database lives on disk #
     root = home if base_dir is None else base_dir
     self.base_dir = root + 'databases/' + self.short_name + '/'
     self.p        = AutoPaths(self.base_dir, self.all_paths)
     # Release specific part of the URL #
     self.url = "release_%s/Exports/" % self.version
     # The database: gzipped download and its unzipped counterpart #
     nr99_gz = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
     self.nr99_name = nr99_gz
     self.nr99_dest = FASTA(self.base_dir + nr99_gz)
     self.nr99      = FASTA(self.base_dir + nr99_gz[:-3])
     # The alignment: gzipped download and its unzipped counterpart #
     aligned_gz = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
     self.aligned_name = aligned_gz
     self.aligned_dest = FASTA(self.base_dir + aligned_gz)
     self.aligned      = FASTA(self.base_dir + aligned_gz[:-3])
Exemple #16
 def __init__(self, base_dir=None):
     """Derive the database directory and its result files. When
     `base_dir` is None the `home` directory is used."""
     # Where the database lives on disk #
     root = home if base_dir is None else base_dir
     self.base_dir = root + 'databases/' + self.short_name + '/'
     self.p        = AutoPaths(self.base_dir, self.all_paths)
     # The two result files #
     self.alignment = FASTA(self.p.mothur_fasta)
     self.taxonomy  = FilePath(self.p.mothur_tax)
     # The part that mothur will use for naming files #
     self.nickname = "foram_mothur"
Exemple #17
def main():
    """Print one encoded line per labelled pair of PDB ids.

    A pair is used as soon as one of its two ids is in the selected data
    file. With `args.pep` only the two peptides are encoded, otherwise
    each peptide is prefixed with its universal groove. `args.label`
    chooses between printing the sequence code ('x') and the label
    code ('y').
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    # Not used below; the call is kept because it still reads its inputs
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)

    # The two AAindex records handed to the encoder
    aaindex = Aaindex()
    index_data = aaindex.get('FASG890101').index_data
    charge_data = aaindex.get('KLEP840101').index_data

    for l in labels:
        (pdbid1, pdbid2) = l.split('_')
        # Either member of the pair being selected is enough
        if pdbid1 not in pdbids and pdbid2 not in pdbids:
            continue
        if args.pep:
            pair = peptides[pdbid1] + '|' + peptides[pdbid2]
        else:
            # Without the `else` both encodings were printed under --pep
            pair = (universalGrooves[pdbid1] + peptides[pdbid1] + '|' +
                    universalGrooves[pdbid2] + peptides[pdbid2])
        finalSeqCode, finalLabelCode = oneHotEncoding(pair, labels[l], index_data, charge_data)
        if args.label == 'x':
            print(', '.join(finalSeqCode))
        elif args.label == 'y':
            print(', '.join(finalLabelCode))
Exemple #18
 def fasta(self):
     """The FASTA file holding the filtered genes of this cluster,
     filled with one entry per gene the first time the file is empty.
     Every entry is named after `gene.name`."""
     fasta = FASTA(self.p.fasta)
     # Already filled: nothing to do #
     if fasta: return fasta
     # Otherwise add every filtered gene #
     for gene in self.filtered_genes:
         fasta.add_str(str(gene), name=gene.name)
     return fasta
Exemple #19
 def __init__(self, path, parent):
     """Attach to the parent pool and declare every file produced
     while processing the reads found at `path`."""
     # The parent is also reachable as `pool` #
     self.parent = self.pool = parent
     self.samples = parent.samples
     # Directory and automatic paths #
     self.base_dir = parent.p.quality_dir + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The reads at their successive stages #
     self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
     self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
     self.trimmed = FASTA(self.p.trimmed)
     # What is handed to qiime #
     self.qiime_fasta = FASTA(self.p.qiime_fasta)
     # What is handed to mothur #
     self.mothur_fasta = FASTA(self.p.mothur_fasta)
     self.mothur_qual = QualFile(self.p.mothur_qual)
     self.mothur_groups = FilePath(self.p.mothur_groups)
     # Primer sizes, taken from the samples of the pool #
     pool_samples = self.pool.samples
     self.trim_fwd = pool_samples.trim_fwd
     self.trim_rev = pool_samples.trim_rev
Exemple #20
def main():
    """Randomly split the PDB ids into a train and a test set.

    The test set receives at most `args.percent` of the ids and only ids
    whose allele is 'A0201'; everything else goes to the train set. The
    two lists are written to 'train.txt' and 'test.txt'.
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    peptides, alleles = totalNineMers(fasta)
    pdbids = peptides.keys()
    testsetlen = int(args.percent * len(pdbids))

    trainset = []
    testset = []
    for p in pdbids:
        r = random()
        if len(testset) < testsetlen and r < 0.5 and alleles[p] == 'A0201':
            # NOTE(review): the body of this branch was missing from the
            # source; test / train appends are the evident intent -- confirm
            testset.append(p)
        else:
            trainset.append(p)

    write_to_file('train.txt', trainset)
    write_to_file('test.txt', testset)
Exemple #21
 def set_size(self, length):
     """Trim all sequences to a specific length starting from the end.
     Reads shorter than `length` are meant to be dropped entirely."""
     # The destination is a FASTA object on a fresh temporary path #
     self.size_trimmed = FASTA(new_temp_path())
     # Skips the reads that are too short, keeps the last `length` positions of the others #
     def trim_iterator(reads):
         for read in reads:
             if len(read) < length: continue
             yield read[-length:]
     # NOTE(review): `trim_iterator` is never called in the visible code, so
     # nothing is written to `self.size_trimmed` before the move below. A
     # line seems to be missing here -- confirm against the upstream source #
     # Replace it #
     shutil.move(self.size_trimmed, self.reads)
Exemple #22
def main():
    """Randomly split the PDB ids into a train and a test set.

    The test set receives at most `args.percent` of the ids; everything
    else goes to the train set. The two lists are written below
    'train/90_10/' and 'test/90_10/'.
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    peptides = totalNineMers(fasta)
    # A real list, so that the ids behave the same on Python 2 and 3
    pdbids = list(peptides.keys())
    testsetlen = int(args.percent * len(pdbids))

    trainset = []
    testset = []
    for pdbid in pdbids:
        r = random()
        if len(testset) < testsetlen and r < 0.5:
            # NOTE(review): the body of this branch was missing from the
            # source; test / train appends are the evident intent -- confirm
            testset.append(pdbid)
        else:
            trainset.append(pdbid)

    write_to_file('train/90_10/train.txt', trainset)
    write_to_file('test/90_10/test.txt', testset)
def main():
    """Print one encoded line per labelled pair of PDB ids.

    A pair is used as soon as one of its two ids is in the selected data
    file. With `args.pep` only the two peptides are encoded, otherwise
    each peptide is prefixed with its universal groove. `args.label`
    chooses between printing the sequence code ('x') and the label
    code ('y').
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    # Not used below; the call is kept because it still reads its inputs
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)

    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)

    for l in labels:
        (pdbid1, pdbid2) = l.split('_')
        # Either member of the pair being selected is enough
        if pdbid1 not in pdbids and pdbid2 not in pdbids:
            continue
        if args.pep:
            pair = peptides[pdbid1] + '|' + peptides[pdbid2]
        else:
            # Without the `else` both encodings were printed under --pep
            pair = (universalGrooves[pdbid1] + peptides[pdbid1] + '|' +
                    universalGrooves[pdbid2] + peptides[pdbid2])
        finalSeqCode, finalLabelCode = oneHotEncoding(pair, labels[l])
        if args.label == 'x':
            print(', '.join(finalSeqCode))
        elif args.label == 'y':
            print(', '.join(finalLabelCode))
Exemple #24
 def fasta(self):
     """The FASTA file with every sequence of `pfam.fasta` whose
     description contains the name of this family. It is built the
     first time it is found missing."""
     fasta = FASTA(self.p.proteins)
     # Nothing to build when the file is already there #
     if fasta.exists: return fasta
     for seq in pfam.fasta:
         if self.fam_name not in seq.description: continue
         fasta.add_seq(seq)
     # At least one sequence must have matched #
     assert fasta
     return fasta
Exemple #25
 def __init__(self, cluster):
     """Attach to the parent cluster and declare the files and the
     taxonomies of this OTU picking method."""
     # The cluster is also reachable as `parent` #
     self.cluster = self.parent = cluster
     # Taken over from the parent #
     self.samples = self.parent.samples
     # Directory and automatic paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + "/"
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Here the main reads file is a FASTQ #
     self.reads = FASTQ(self.p.all_reads)
     # Output files #
     self.cdhit_clusters = FilePath(self.p.clstr)
     self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
     self.centers = FASTA(self.p.centers)
     # One taxonomy per reference database #
     self.taxonomy_silva = CrestTaxonomy(self.centers, self, "silvamod", self.p.silva)
     self.taxonomy_fw = CrestTaxonomy(self.centers, self, "freshwater", self.p.fw_dir)
     # The one used by default #
     self.taxonomy = self.taxonomy_silva
Exemple #26
 def __init__(self,
              query_path,                          # The input sequences
              db_path,                             # The database to search against
              seq_type     = 'prot',               # The seq type of the query_path file: 'prot' or 'nucl'
              params       = None,                 # Add extra params for the command line
              algorithm    = "blastn",             # "blastn" or "blastp"
              out_path     = None,                 # Where the results will be dropped
              executable   = None,                 # If you want a specific binary give the path
              cpus         = None,                 # The number of threads to use
              num          = None,                 # When parallelized, the number of this thread
              _out         = None,                 # Store the stdout at this path
              _err         = None):                # Store the stderr at this path
     """Record the settings of a BLAST search.

     `out_path` may be None (results go next to the query, with
     `self.extension` appended), a directory ending in '/' (the query
     prefix plus the extension is appended), or a complete file path.
     When `cpus` is None, all cores are used, capped at 32. Passing
     True as `_out` or `_err` stores the stream next to the output
     file with a '.stdout' or '.stderr' suffix."""
     # Main input #
     self.query = FASTA(query_path)
     # The database to search against #
     self.db = FilePath(db_path)
     # Other attributes #
     self.seq_type     = seq_type
     self.algorithm    = algorithm
     self.num          = num
     self.params       = params if params else {}
     # The standard output and error #
     self._out         = _out
     self._err         = _err
     # Output defaults #
     if out_path is None:
         self.out_path = self.query.prefix_path + self.extension
     elif out_path.endswith('/'):
         self.out_path = out_path + self.query.prefix + self.extension
     else:
         # A complete file path was given: use it unchanged #
         self.out_path = out_path
     # Make it a file path #
     self.out_path = FilePath(self.out_path)
     # Executable #
     self.executable = FilePath(executable)
     # Cores to use #
     if cpus is None: self.cpus = min(multiprocessing.cpu_count(), 32)
     else:            self.cpus = cpus
     # Save the output somewhere #
     if self._out is True:
         self._out = self.out_path + '.stdout'
     if self._err is True:
         self._err = self.out_path + '.stderr'
Exemple #27
 def __init__(self, parent):
     """Attach to the parent assemble group and declare the read files
     of this group at their successive filtering stages."""
     # Save parent (also reachable as `assemble_group`) #
     self.parent, self.assemble_group = parent, parent
     self.samples = parent.samples
     self.pool    = self.parent.pool
     self.primers = self.pool.primers
     # Auto paths #
     self.base_dir = parent.p.groups_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # More: these two use the reads class chosen by the parent (`parent.cls`) #
     self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
     self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
     # Quality filtered #
     # NOTE(review): the parent object is compared with a string here, which
     # presumably relies on a custom __eq__ of the parent -- confirm #
     if self.parent == 'assembled':
         self.qual_filtered = BarcodedFASTQ(self.p.qual_filtered,      samples=self.samples, primers=self.primers)
         self.len_filtered =  BarcodedFASTQ(self.p.len_filtered_fastq, samples=self.samples, primers=self.primers)
         self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
     # Further #
     # NOTE(review): the snippet stops here, the rest of the method is not visible #
Exemple #28
 def __init__(self, cluster):
     """Attach to the parent cluster and declare the files and the
     taxonomies of this OTU picking method."""
     # The cluster is also reachable as `parent` #
     self.cluster = self.parent = cluster
     # Taken over from the parent #
     self.samples = self.parent.samples
     # Directory and automatic paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The main FASTA file is the one of the parent #
     self.reads = self.parent.reads
     # Output files #
     self.all_otus = FilePath(self.p.all_otus)
     self.all_centers = FASTA(self.p.all_centers)
     self.otus = FilePath(self.base_dir + "otus.txt")
     self.centers = FASTA(self.base_dir + "centers.fasta")
     # One taxonomy per reference database #
     self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
     self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
     # The one used by default #
     self.taxonomy = self.taxonomy_silva
Exemple #29
 def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
     """Describe how the FASTA file at `path` is split into parts.

     Give either `num_parts` or `part_size`, a human readable size such
     as the ones `humanfriendly.parse_size` accepts. When both are
     given, the number derived from `part_size` wins. The parts live in
     `base_dir`, which defaults to `path + '.parts/'`, one numbered
     directory per part starting at 001."""
     # Basic #
     self.path = path
     # Directory #
     if base_dir is None: self.base_dir = path + '.parts/'
     else: self.base_dir = base_dir
     # Num parts #
     if num_parts is not None: self.num_parts = num_parts
     # Evaluate size #
     if part_size is not None:
         self.bytes_target = humanfriendly.parse_size(part_size)
         # True division is forced: on Python 2, int / int floors before
         # the ceil is applied and the last partial part would be lost #
         ratio = float(self.count_bytes) / self.bytes_target
         self.num_parts = int(math.ceil(ratio))
     # Make parts #
     self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
     self.parts = [
         FASTA(self.make_name(i)) for i in range(1, self.num_parts + 1)
     ]
     # Give a number to each part (numbers start at 0, directories at 1) #
     for i, part in enumerate(self.parts):
         part.num = i
Exemple #30
 def test(self):
     """Search one sequence, and see if it works."""
     # New directory #
     directory = new_temp_dir()
     # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
     # NOTE(review): `seq` is read below before it is ever assigned; the
     # literal holding the sequence is missing from the visible code #
     seq = seq.replace('\n','')
     seq = seq.replace(' ','')
     # Make input #
     input_fasta = FASTA(directory + 'input.fasta')
     input_fasta.add_str(seq, "My test sequence")
     # Make output #
     out_path = directory + 'output.blast'
     # Make extras parameters #
     params = {'-outfmt': 0,
               '-evalue': 1e-5,
               '-perc_identity': 99}
     # Make the search #
     search = SeqSearch(input_fasta,
                        num_threads = 1,
                        out_path    = out_path,
                        params      = params)
     # Run it #
     # NOTE(review): no call on `search` is visible here, so nothing is run
     # before success is reported -- confirm against the upstream source #
     # Print result #
     print("Success", directory)
Exemple #31
 def __init__(self, cluster):
     """Attach to the parent cluster and declare the files, the
     taxonomies and the source tracking of this OTU picking method."""
     # The cluster is also reachable as `parent` #
     self.cluster = self.parent = cluster
     # Taken over from the parent #
     self.samples = self.parent.samples
     # Directory and automatic paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The main FASTA file is the one of the parent #
     self.reads = self.parent.reads
     # Output files #
     self.derep = SizesFASTA(self.p.derep)
     self.sorted = SizesFASTA(self.p.sorted)
     self.centers = FASTA(self.p.centers)
     self.readmap = UClusterFile(self.p.readmap)
     # One taxonomy per reference database #
     self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva_dir)
     self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
     self.taxonomy_unite = CrestTaxonomy(self.centers, self, 'unite', self.p.unite_dir)
     self.taxonomy_rdp = RdpTaxonomy(self.centers, self)
     # The one used by default #
     self.taxonomy = self.taxonomy_silva
     # Source tracking #
     self.seqenv = Seqenv(self)
Exemple #32
def read_fasta(args):
    """Collect the nine residue sequences of the FASTA file `args.fasta`.

    Headers are split on '|': the first field is the PDB id, the second
    the chain id. Returns the tuple `(pep_chain, pep_seq)` of two dicts
    keyed by PDB id, holding the chain id and the sequence.
    """
    fasta = FASTA(args.fasta)
    headers = fasta.get_headers()
    pep_chain, pep_seq = {}, {}

    for header in headers:
        fields = header.split('|')
        pdbid, chainid = fields[0], fields[1]
        seq = fasta.get_sequence(header)
        # Only sequences of exactly nine residues are kept
        if len(seq) != 9:
            continue
        pep_chain[pdbid] = chainid
        pep_seq[pdbid] = seq

    return (pep_chain, pep_seq)
Exemple #33
 def __init__(self, version, seq_type, base_dir=None):
     """Record the SILVA release and derive every path and file name
     from it. When `base_dir` is None the `home` directory is used."""
     # Attributes #
     self.version    = version
     self.seq_type   = seq_type
     self.short_name = self.short_name + "_" + self.version
     # Where the database lives on disk #
     root = home if base_dir is None else base_dir
     self.base_dir = root + 'databases/' + self.short_name + '/'
     self.p        = AutoPaths(self.base_dir, self.all_paths)
     # Release specific part of the URL #
     self.url = "release_%s/Exports/" % self.version
     # The database: gzipped download and its unzipped counterpart #
     nr99_gz = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
     self.nr99_name = nr99_gz
     self.nr99_dest = FASTA(self.base_dir + nr99_gz)
     self.nr99      = FASTA(self.base_dir + nr99_gz[:-3])
     # The alignment: gzipped download and its unzipped counterpart #
     aligned_gz = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
     self.aligned_name = aligned_gz
     self.aligned_dest = FASTA(self.base_dir + aligned_gz)
     self.aligned      = FASTA(self.base_dir + aligned_gz[:-3])
Exemple #34
 def test(self):
     """Search one sequence, and see if it works."""
     # New directory #
     directory = new_temp_dir()
     # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
     # NOTE(review): `seq` is read below before it is ever assigned; the
     # literal holding the sequence is missing from the visible code #
     seq = seq.replace('\n','')
     seq = seq.replace(' ','')
     # Make input #
     input_fasta = FASTA(directory + 'input.fasta')
     input_fasta.add_str(seq, "My test sequence")
     # Make output #
     out_path = directory + 'output.blast'
     # Make extras parameters #
     params = {'-outfmt': 0,
               '-evalue': 1e-5,
               '-perc_identity': 99}
     # Make the search #
     search = SeqSearch(input_fasta,
                        num_threads = 1,
                        out_path    = out_path,
                        params      = params)
     # Run it #
     # NOTE(review): no call on `search` is visible here, so nothing is run
     # before success is reported -- confirm against the upstream source #
     # Print result (Python 2 print statement) #
     print "Success", directory
Exemple #35
 def to_fasta(self, path):
     """Write every record of this object to `path` in FASTA format and
     return a FASTA object on that path."""
     with open(path, 'w') as handle:
         # One write per record, in iteration order #
         for record in self: SeqIO.write(record, handle, 'fasta')
     return FASTA(path)
Exemple #36
 def all_proteins(self):
     """The main FASTA file, found at `self.p.unzipped_proteins`."""
     path = self.p.unzipped_proteins
     return FASTA(path)
Exemple #37
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM.

    Every operation is forwarded to the forward file `self.fwd` and to
    the reverse file `self.rev`. The object can be used as a context
    manager: entering calls `create`, leaving calls `close`.

    NOTE(review): the bodies of `open`, `close`, `create`, `add_pair` and
    `remove` were missing from the source and are reconstructed here as
    plain forwarding to both files -- confirm against the upstream code.
    """
    format = 'fasta'

    def __len__(self): return self.count
    def __iter__(self): return self.parse()
    def __nonzero__(self): return bool(self.fwd) and bool(self.rev)
    # Python 3 looks for __bool__; without it truthiness falls back to __len__
    __bool__ = __nonzero__
    def __repr__(self): return '<%s object on "%s" and "%s">' % \
                        (self.__class__.__name__, self.fwd.path, self.rev.path)

    def __enter__(self): return self.create()
    def __exit__(self, exc_type, exc_value, traceback): self.close()

    @property
    def exists(self):
        """True when both files exist. A property, like `FASTA.exists`."""
        return self.fwd.exists and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        # FASTA objects #
        self.fwd = FASTA(fwd)
        self.rev = FASTA(rev)
        # Extra #
        self.gzipped = self.fwd.gzipped
        self.parent = parent

    @property
    def count(self):
        """The number of pairs. Must be a property because `__len__`
        returns it directly. Both files must hold the same number."""
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):
        """Open both files."""
        self.fwd.open()
        self.rev.open()

    def parse(self):
        """Iterate over `(forward, reverse)` tuples of records."""
        return izip(self.fwd.parse(), self.rev.parse())

    def close(self):
        """Close both files."""
        self.fwd.close()
        self.rev.close()

    def create(self):
        """Create both files and return self."""
        self.fwd.create()
        self.rev.create()
        return self

    def add(self, f, r):
        """Add one forward and one reverse record."""
        return self.add_pair((f,r))

    def add_pair(self, pair):
        """Add a `(forward, reverse)` tuple of records."""
        self.fwd.add(pair[0])
        self.rev.add(pair[1])

    def remove(self):
        """Remove both files."""
        self.fwd.remove()
        self.rev.remove()

    def progress(self):
        """Just like self.parse but display a progress bar"""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        """Randomly pick `down_to` pairs and add them to `dest_pair`. When
        no destination is given, a new pair of files is made."""
        # Check size #
        assert down_to < len(self)
        # Make new pair of files #
        if dest_pair is None:
            # NOTE(review): `fwd_path` / `rev_path` are not set anywhere in
            # this class; `self.fwd.path` may be what is meant -- confirm
            dest_fwd_path = self.fwd_path.new_name_insert("subsampled")
            dest_rev_path = self.rev_path.new_name_insert("subsampled")
            dest_pair = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it #
        for pair in isubsample(self, down_to): dest_pair.add_pair(pair)
        # Did it work #
        assert len(dest_pair) == down_to

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        """Pair up the primer parsing generators of both files. The
        length of the result is the one of the forward generator."""
        fwd_gen = self.fwd.parse_primers(*args, **kwargs)
        rev_gen = self.rev.parse_primers(*args, **kwargs)
        generator = izip(fwd_gen, rev_gen)
        return GenWithLength(generator, len(fwd_gen))
Exemple #38

# Get statistics #

# Get clustering values #
r1, r2 = list(set([p.run for p in proj]))
# Fraction of the raw cluster density that passed the filter, per run #
for run in (r1, r2):
    stats = run.report_stats['fwd']
    print(float(stats['DensityPF']) / float(stats['DensityRaw']))

# Check below 400 bp sequences #
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "below_400/")
over = FASTA(folder + "reads.fasta")
def over_iterator(reads, max_length=400):
    """Yield only the reads that are at most `max_length` long."""
    for read in reads:
        if len(read) > max_length: continue
        yield read
for pool in pools:
    short_reads = over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered)
    over.add_iterator(short_reads)
crest = SimpleCrestTaxonomy(over, folder)
rdp = SimpleRdpTaxonomy(over, folder)

# Check unassembled mate pairs #
Exemple #39
class Foraminifera(Database):
    """This is a custom database containing exclusively Foraminifera sequences.


    You should place the file "foram_db_cor.fasta" in:  ~/databases/foraminifera/
    Then you can run this:
            from seqsearch.databases.foraminifera import foraminifera
            print foraminifera.tax_depth_freq


    short_name = "foraminifera"
    long_name  = 'The custom made Foraminifera database as received by email on 7th April 2017'

    all_paths = """

    def rank_names(self):
        """Return the names of the 9 ranks as a new list.

        Index 0 is 'Domain' and index 8 is 'Species'."""
        # Index: 0      1       2      3     4     5      6     7     8
        names = 'Domain Kingdom Phylum Class Order Family Tribe Genus Species'
        return names.split()

    def __init__(self, base_dir=None):
        """Set up the directory layout and the two result files.

        Parameters:
            base_dir: prefix under which 'databases/<short_name>/' is placed.
                      When omitted, the `home` variable is used instead.
        """
        # Base directory #
        root = home if base_dir is None else base_dir
        self.base_dir = root + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The results #
        self.alignment = FASTA(self.p.mothur_fasta)
        self.taxonomy = FilePath(self.p.mothur_tax)
        # The part that mothur will use for naming files #
        self.nickname = "foram_mothur"

    def process(self):
        """Turn the raw FASTA file into the alignment and the taxonomy files.

        The id of every raw sequence, once its first 11 characters are dropped,
        is split on '|': the first field is a number, the next six are used as
        Class, Order, Family, Tribe, Genus and Species. Each sequence is added
        to `self.alignment` under the name "foram<number>" and a matching
        tab-separated line is added to `self.taxonomy`.

        Raises:
            AssertionError: if a field contains ';' or a tab character.
            IndexError: if fewer than six fields follow the number.
        """
        # The file that was received by email without documentation T_T #
        source = FASTA(self.p.cor)
        # The three upper ranks are identical for every sequence #
        upper_ranks = ['Eukaryota',     # 0 Domain
                       'Rhizaria',      # 1 Kingdom
                       'Foraminifera']  # 2 Phylum
        # NOTE(review): no statement opening or closing the output files is visible here.
        for record in source:
            # Parse #
            fields = record.id[11:].split('|')
            number = fields.pop(0)
            # Check: these characters would corrupt the taxonomy line #
            for field in fields: assert ';' not in field
            for field in fields: assert '\t' not in field
            # Make ranks: 3 Class, 4 Order, 5 Family, 6 Tribe, 7 Genus, 8 Species #
            ranks = upper_ranks + [fields[i] for i in range(6)]
            # The taxonomy string #
            tax_line = ';'.join(ranks)
            label = "foram" + number
            # Add sequence to the new fasta file #
            self.alignment.add_str(str(record.seq), name=label)
            # Add the taxonomy to the tax file #
            self.taxonomy.add_str(label + '\t' + tax_line + '\n')
Exemple #40
 def fasta(self):
     """Return a new FASTA object built on the path `self.autopaths.fasta`."""
     return FASTA(self.autopaths.fasta)
Exemple #41
 def combine_rerun_with_orig(self):
     """Special case when a sample with low reads was rerun in another pool.

     Run this just before the combine_reads() method of the associated cluster.
     This method is called on the rerun sample, not on the original one.

     Returns:
         False when `self.info` has no 'rerun' entry, True otherwise.

     Raises:
         AssertionError: when one of the sanity checks below fails.
     """
     # Check we have a rerun #
     rerun = self.info.get('rerun')
     if rerun is None:
         return False
     # Check we are processed #
     assert self.fasta.count > 0
     # Get the original sample ('pool' and 'num' are shifted by one to index the lists) #
     run, pool, num = rerun['run'], rerun['pool'], rerun['num']
     orig_sample = illumitag.runs[run][pool - 1][num - 1]
     merged_path = orig_sample.base_dir + 'rerun_merged.fasta'
     merged = FASTA(merged_path)
     # Check we don't merge twice #
     assert orig_sample.count == orig_sample.fasta.count
     # Do it #
     merged.rename_with_num(orig_sample.name + '_read', orig_sample.fasta)
     # Check: rebuild the FASTA object and make sure it now holds more reads #
     orig_sample.fasta = FASTA(orig_sample.fasta.path)
     assert orig_sample.count < orig_sample.fasta.count
     return True
Exemple #42
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """

    def __repr__(self):
        """Show the class name, the cluster name and the number of samples."""
        details = (self.__class__.__name__, self.name, len(self.samples))
        return '<%s object "%s" with %i samples>' % details

    def __iter__(self):
        """Iterating over a cluster iterates over its samples."""
        return iter(self.samples)

    def __len__(self):
        """The length of a cluster is its number of samples."""
        return len(self.samples)
    def __getitem__(self, key):
        """Fetch a child by short name, by number or by position.

        * A string is lower-cased and compared with each child's `short_name`.
        * An integer is compared with each child's `num`, provided that
          `self.first` has a `num` attribute.
        * In every other case `key` is used directly on `self.children`.

        Raises IndexError when a name or number matches no child."""
        if isinstance(key, basestring):
            matches = [c for c in self.children if c.short_name == key.lower()]
            return matches[0]
        if isinstance(key, int) and hasattr(self.first, 'num'):
            matches = [c for c in self.children if c.num == key]
            return matches[0]
        return self.children[key]

    @property
    def first(self):
        """The first child of this cluster, i.e. `self.children[0]`.

        Exposed as a property because the rest of the class reads it as an
        attribute (`self.first.pool.project`, `hasattr(self.first, 'num')`);
        as a plain method those expressions would act on the bound method.

        Raises IndexError when there is no child."""
        return self.children[0]

    def count_seq(self):
        """Add up `len(sample)` over every sample of the cluster."""
        total = 0
        for sample in self:
            total += len(sample)
        return total

    def __init__(self, samples, name, base_dir=None):
        """Record the samples and work out the pools and the output directory.

        Parameters:
            samples: the sample objects to group; each needs `short_name`,
                     `used` and `pool` attributes.
            name: name of the cluster, also used in the default directory.
            base_dir: output directory; when empty or omitted it becomes
                      `illumitag.view_dir` + "clusters/<name>/".

        Raises:
            AssertionError: if two used samples share the same short name.
        """
        # Save samples #
        self.name = name
        self.samples = samples
        self.children = samples
        # Check names are unique #
        used_names = [s.short_name for s in samples if s.used]
        assert len(used_names) == len(set(used_names))
        # Figure out pools, ordered by their id_name #
        self.pools = sorted(set([s.pool for s in self.samples]), key=lambda pool: pool.id_name)
        # Directory #
        if base_dir:
            self.base_dir = base_dir
        else:
            self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        # Loaded #
        self.loaded = False

    def load(self):
        """Delayed second half of the constructor, to be called only when needed.

        Loads every pool and sample, builds the automatic paths, works out
        whether the cluster covers a whole project, then creates the runner,
        the reads file, the three OTU pickers and the two reporting objects.

        Returns:
            self, for convenience.
        """
        # Load the pools and samples #
        for pool in self.pools:
            pool.load()
        for sample in self.samples:
            sample.load()
        # Dir #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Figure out if it's a project: same samples as the first sample's project #
        if set(self.samples) == set(self.first.pool.project.samples):
            self.project = self.first.pool.project
        else:
            self.project = None
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit = CdhitOTUs(self)
        # Preferred #
        self.otus = self.otu_uparse
        # Simple reporting, then the full report #
        self.reporter = ClusterReporter(self)
        self.report = ClusterReport(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def run(self, *args, **kwargs):
        """Forward every argument to `self.runner.run`; nothing is returned."""
        runner = self.runner
        runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        """Forward every argument to `self.runner.run_slurm`; nothing is returned."""
        runner = self.runner
        runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        """Call `process()` on each sample in turn, behind a tqdm progress bar."""
        for current in tqdm(self):
            current.process()

    def combine_reads(self):
        """This is the first method you should call. It combines all the
        reads of all the samples of this cluster into one big FASTA file.

        The files at `sample.fasta.path` are concatenated byte for byte, in
        iteration order, into `self.reads`, which is overwritten. The copy is
        done in Python instead of through a shell `cat ... > ...` command, so
        paths containing spaces or shell metacharacters are handled and a
        cluster without samples simply produces an empty file.

        Returns:
            `self.reads`
        """
        paths = [sample.fasta.path for sample in self]
        # str() gives the same text the former '%s' formatting produced #
        with open(str(self.reads), 'wb') as combined:
            for path in paths:
                with open(path, 'rb') as handle:
                    shutil.copyfileobj(handle, combined)
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end."""
        # A FASTA object on a fresh temporary path #
        self.size_trimmed = FASTA(new_temp_path())
        # Skips reads shorter than `length`; keeps the last `length` items of the others #
        def trim_iterator(reads):
            for read in reads:
                if len(read) < length: continue
                yield read[-length:]
        # NOTE(review): `trim_iterator` is never called in this block, so nothing
        # visible here fills `self.size_trimmed` before the move below -- confirm.
        # Replace it #
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self):
        """Shortcut for `self.otu_uparse.run()`; nothing is returned."""
        self.otu_uparse.run()

    @property
    def metadata(self):
        """A pandas DataFrame with one row per sample, built from each sample's
        `info` and indexed by the samples' `short_name`.

        Exposed as a property because `export_metadata` reads it as an
        attribute (`self.metadata.to_csv(...)`), which would fail on a plain
        method."""
        # Walk the samples once so rows and index are guaranteed to line up #
        samples = list(self)
        rows = [s.info for s in samples]
        index = [s.short_name for s in samples]
        return pandas.DataFrame(rows, index=index)

    def export_metadata(self):
        """Write `self.metadata` to the path `self.p.metadata` as a
        tab-separated, UTF-8 encoded file."""
        table = self.metadata
        table.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
Exemple #43
from fasta import FASTA, AlignedFASTA
# A FASTA object on the file 'community.fasta' #
community = FASTA('community.fasta')
# An AlignedFASTA object on the file 'alignment.fasta' #
alignment = AlignedFASTA('alignment.fasta')
Exemple #44
class PrimerGroup(object):
    """A bunch of sequences all having the same type of primer outcome
    (and assembly outcome)"""

    all_paths = """

    qual_threshold = 5
    qual_windowsize = 10
    min_length = 400

    def __repr__(self):
        """Show the class name followed by the parent object."""
        class_name = self.__class__.__name__
        return '<%s object of %s>' % (class_name, self.parent)

    def __len__(self):
        """The length of the group is the length of `self.orig_reads`."""
        return len(self.orig_reads)

    def create(self):
        """Delegate to `self.orig_reads.create()`; nothing is returned."""
        self.orig_reads.create()

    def add_seq(self, read):
        """Pass `read` on to `self.orig_reads.add_seq`; nothing is returned."""
        self.orig_reads.add_seq(read)

    def close(self):
        """Delegate to `self.orig_reads.close()`; nothing is returned."""
        self.orig_reads.close()

    def __init__(self, parent):
        """Attach the group to its parent and prepare the read files.

        Parameters:
            parent: the owning object; it must provide `samples`, `pool`,
                    `p.groups_dir` and `cls` (the class used for read files).

        `self.short_name` is read here but not set in this class.
        """
        # Save parent #
        self.parent = parent
        self.assemble_group = parent
        self.samples = parent.samples
        self.pool = self.parent.pool
        self.primers = self.pool.primers
        # Auto paths #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # More #
        reads_class = self.parent.cls
        self.orig_reads = reads_class(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = reads_class(self.p.n_filtered, samples=self.samples)
        # Quality filtered: only when the parent compares equal to 'assembled' #
        if self.parent == 'assembled':
            self.qual_filtered = BarcodedFASTQ(self.p.qual_filtered, samples=self.samples, primers=self.primers)
            self.len_filtered = BarcodedFASTQ(self.p.len_filtered_fastq, samples=self.samples, primers=self.primers)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Further #

    def load(self):
        """Nothing to load for this object; the method does nothing."""
        return None

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n

        NOTE(review): in this block the inner generator is only defined and
        never consumed; the statement using it appears to be missing.
        """
        def no_n_iterator(reads):
            """Yield the reads with no 'N' once the first `fwd_len` and the
            last `rev_len` positions are left out."""
            fwd_len = self.pool.primers.fwd_len
            rev_len = self.pool.primers.rev_len
            # A plain `-rev_len` end gives an empty slice when rev_len is 0,
            # which would let every read through unchecked.
            end = -rev_len if rev_len else None
            for read in reads:
                if 'N' in read[fwd_len:end]: continue
                yield read

    def qual_filter(self):
        """Called from Assemble.quality_filter

        NOTE(review): in this block the inner generator is only defined and
        never consumed; the statement using it appears to be missing.
        """
        def good_qual_iterator(reads):
            """Yield the reads whose moving-averaged 'phred_quality' values
            never go below `self.qual_threshold`."""
            for read in reads:
                scores = read.letter_annotations["phred_quality"]
                smoothed = moving_average(scores, self.qual_windowsize)
                too_low = [value < self.qual_threshold for value in smoothed]
                if not any(too_low): yield read

    def len_filter(self):
        """Called from Assemble.length_filter

        NOTE(review): in this block the inner generator is only defined and
        never consumed; the statement using it appears to be missing.
        """
        def good_len_iterator(reads):
            """Yield the reads that are at least `self.min_length` long."""
            for read in reads:
                if len(read) >= self.min_length: yield read

    def trim_bc(self):
        """Called from Assemble.trim_barcodes"""
        def no_barcodes_iterator(reads):
            for read in reads:
                yield read[self.pool.bar_len:-self.pool.bar_len]
        if self.pool.bar_len == 0:
Exemple #45
    'organism': 'TcruziCLBrenerEsmeraldo-like'
non_emeraldo = {
    'organism': 'TcruziCLBrenerNon-Esmeraldo-like'

organism = emeraldo_like

if __name__ == "__main__":
    # Load FASTA files
    genome = FASTA(organism['genome_filename'])

    regions = FASTA(organism['regions_filename'])

    # Load database file
    sqlite = sqlite3.connect(SQLite_DB)

    # Create MFASeq Folder
    Organism_MFASeq_folder = f"{MFASeq_folder}/MFA-Seq_{organism['organism']}"
    if not os.path.isdir(Organism_MFASeq_folder):

    # Create MFASeq Files
    for chromosome_id in genome.data.keys():
Exemple #46
class QualityReads(object):
    """A set of sequences determined to be quality controlled"""

    all_paths = """

    def __repr__(self):
        """Show the class name followed by the parent object."""
        class_name = self.__class__.__name__
        return '<%s object of %s>' % (class_name, self.parent)

    def __len__(self):
        """The length of this object is the length of `self.trimmed`."""
        return len(self.trimmed)

    def __init__(self, path, parent):
        """Attach to the parent and declare every file this object works with.

        Parameters:
            path: location handed to `BarcodedFASTQ` for the untrimmed reads.
            parent: the owning object, stored both as `parent` and as `pool`;
                    it must provide `samples` and `p.quality_dir`.
        """
        # Save parent #
        self.parent = parent
        self.pool = parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        pool_samples = self.pool.samples
        self.trim_fwd = pool_samples.trim_fwd
        self.trim_rev = pool_samples.trim_rev

    def filter_unused(self):
        """Define the filter that keeps the reads of used samples.

        NOTE(review): in this block the inner generator is only defined and
        never consumed; the statement using it appears to be missing.
        """
        def no_unused_iterator(reads):
            """Yield `r.read` for every parsed item whose first sample is used."""
            for parsed in reads.parse_barcodes():
                if not parsed.first.sample.used: continue
                yield parsed.read

    def trim_primers(self):
        """Define the filter that cuts `self.trim_fwd` positions off the start
        and `self.trim_rev` positions off the end of every read.

        NOTE(review): in this block the inner generator is only defined and
        never consumed; the statement using it appears to be missing.
        """
        def no_primers_iterator(reads):
            """Yield every read without its two ends."""
            # A plain `-self.trim_rev` end gives an empty slice when trim_rev
            # is 0, which would wipe out every read instead of leaving its
            # end untouched.
            for read in reads:
                end = -self.trim_rev if self.trim_rev else None
                yield read[self.trim_fwd:end]

    def make_mothur_output(self):
        """Walk over the used reads and build one "<read id>\\t<sample>\\n"
        line per read.

        NOTE(review): the line is built but not written anywhere in this
        block; the writing statements appear to be missing.
        """
        # Trimmed fasta #
        # The groups file #
        for parsed in self.only_used.parse_barcodes():
            sample_name = parsed.first.sample.short_name
            read_name = '%s\t%s\n' % (parsed.read.id, sample_name)

    def make_qiime_output(self):
        """Prepare the QIIME-style headers of the used reads.

        Every read gets the id "<sample>_<n> <old id>", where n counts the
        reads of that sample, and a description holding the first
        `self.pool.bar_len` letters of its sequence as both 'orig_bc' and
        'new_bc'. The file at `self.qiime_fasta.path` is opened for writing
        and is now always closed, even if an error occurs half way.

        NOTE(review): no statement writing the records through `writer` is
        visible in this block -- confirm whether one is missing.
        """
        # Prepare fasta writer; the `with` block guarantees the handle is closed #
        with open(self.qiime_fasta.path, 'w') as handle:
            writer = FastaWriter(handle, wrap=0)
            # Counter #
            counter = defaultdict(int)
            # Do it #
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                counter[sample_name] += 1
                r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
                bar_seq = r.read.seq[0:self.pool.bar_len]
                r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
Exemple #47
 def __init__(self, fasta_path, seq_type='nucl'):
     """Record the sequence type, then run the FASTA constructor.

     Parameters:
         fasta_path: what is handed to `FASTA.__init__`. If it has a
                     `seq_type` attribute, that value takes precedence.
         seq_type: either 'nucl' or 'prot'. The default is 'nucl'; the former
                   default expression `'nucl' or 'prot'` always evaluated to
                   exactly that value.
     """
     if hasattr(fasta_path, 'seq_type'): self.seq_type = fasta_path.seq_type
     else:                               self.seq_type = seq_type
     FASTA.__init__(self, fasta_path)
Exemple #48
class Silva(Database):
    SILVA provides comprehensive, quality checked and regularly updated
    datasets of aligned small (16S/18S, SSU) and large subunit (23S/28S, LSU)
    ribosomal RNA (rRNA) sequences for all three domains of life
    (Bacteria, Archaea and Eukarya).
    SILVA are the official databases of the software package ARB.


    To install:
        from seqsearch.databases.silva import silva

    It will put it in ~/databases/silva_xxx/

    view_url   = "https://www.arb-silva.de/no_cache/download/archive/"
    base_url   = "https://www.arb-silva.de/fileadmin/silva_databases/"
    short_name = "silva"

    all_paths = """

    def __init__(self, version, seq_type, base_dir=None):
        """Work out every name and path for one release of the database.

        Parameters:
            version: release identifier, inserted in the short name, the URL
                     and the file names.
            seq_type: stored as is on the instance.
            base_dir: prefix under which 'databases/<short_name>/' is placed.
                      When omitted, the `home` variable is used instead.
        """
        # Attributes #
        self.version = version
        self.seq_type = seq_type
        # The instance short name is the class-level one followed by the version #
        self.short_name = self.short_name + "_" + self.version
        # Base directory #
        root = home if base_dir is None else base_dir
        self.base_dir = root + 'databases/' + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # URL #
        self.url = "release_%s/Exports/" % self.version
        # The database: the downloaded '.gz' file and the same name without its last 3 characters #
        self.nr99_name = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
        nr99_path = self.base_dir + self.nr99_name
        self.nr99_dest = FASTA(nr99_path)
        self.nr99 = FASTA(self.base_dir + self.nr99_name[:-3])
        # The alignment: same scheme #
        self.aligned_name = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
        aligned_path = self.base_dir + self.aligned_name
        self.aligned_dest = FASTA(aligned_path)
        self.aligned = FASTA(self.base_dir + self.aligned_name[:-3])

    def download(self):
        """Fetch the two '.gz' files with `wget`, announcing each URL first.

        Each file is taken from `self.base_url + self.url + <file name>` and
        stored at the path of its `*_dest` object."""
        import wget
        targets = ((self.nr99_name, self.nr99_dest),
                   (self.aligned_name, self.aligned_dest))
        for file_name, destination in targets:
            print("\nDownloading", self.base_url + self.url + file_name)
            wget.download(self.base_url + self.url + file_name, out=destination.path)

    def unzip(self):
Exemple #49
class PairedFASTA(object):
    """Read and write FASTA file pairs without using too much RAM"""
    format = 'fasta'

    def __len__(self):
        """The length of the pair is `self.count`."""
        return self.count

    def __iter__(self):
        """Iterating over the pair is the same as iterating over `self.parse()`."""
        return self.parse()

    def __nonzero__(self):
        """Truth value (Python 2 protocol): True only when both files are truthy."""
        if not self.fwd: return False
        return bool(self.rev)

    def __repr__(self):
        """Show the class name and the path of both files."""
        details = (self.__class__.__name__, self.fwd.path, self.rev.path)
        return '<%s object on "%s" and "%s">' % details

    def __enter__(self):
        """Entering a `with` block returns the result of `self.create()`."""
        return self.create()

    def __exit__(self, exc_type, exc_value, traceback):

    def exists(self):
        """Combine the `exists` attribute of both files with `and`.

        The reverse file is only looked at when the forward value is truthy."""
        forward_there = self.fwd.exists
        return forward_there and self.rev.exists

    def __init__(self, fwd, rev, parent=None):
        """Wrap the two given files in FASTA objects.

        Parameters:
            fwd: handed to `FASTA` to build `self.fwd`.
            rev: handed to `FASTA` to build `self.rev`.
            parent: optional owner, stored as is.

        `self.gzipped` is copied from the forward file only.
        """
        # FASTA objects #
        forward, reverse = FASTA(fwd), FASTA(rev)
        self.fwd = forward
        self.rev = reverse
        # Extra #
        self.gzipped = forward.gzipped
        self.parent = parent

    @property
    def count(self):
        """The `count` of the forward file, after checking that the reverse
        file has the same one.

        Exposed as a property because `__len__` returns `self.count` directly
        and `len()` requires an integer, not a bound method.

        Raises:
            AssertionError: if the two files report different counts.
        """
        assert self.fwd.count == self.rev.count
        return self.fwd.count

    def open(self):

    def parse(self):
        """Return an `izip` over `self.fwd.parse()` and `self.rev.parse()`,
        giving one (forward, reverse) tuple at a time."""
        forward = self.fwd.parse()
        reverse = self.rev.parse()
        return izip(forward, reverse)

    def close(self):

    def create(self):
        """Hand back this very object; `__enter__` relies on that return value."""
        return self

    def add(self, f, r):
        """Bundle `f` and `r` into a tuple and delegate to `self.add_pair`.

        Whatever `add_pair` returns is passed through unchanged."""
        pair = (f, r)
        return self.add_pair(pair)

    def add_pair(self, pair):

    def remove(self):

    def progress(self):
        """Return a tqdm wrapper around this object, so that iterating over it
        shows a progress bar whose total is `len(self)`."""
        return tqdm(self, total=len(self))

    def subsample(self, down_to, dest_pair=None):
        """Pick `down_to` pairs with `isubsample` and add them to `dest_pair`.

        Parameters:
            down_to: number of pairs to keep; must be strictly smaller than
                     `len(self)`.
            dest_pair: object receiving the pairs through `add_pair`. When
                       omitted, a new instance of this class is built on two
                       paths derived with `new_name_insert("subsampled")`.

        Returns:
            The destination pair (previously nothing was returned, which left
            the caller without a handle on an automatically created pair).

        Raises:
            AssertionError: if `down_to` is not smaller than `len(self)` or if
                            `dest_pair` does not end up with `down_to` pairs.
        """
        # Check size #
        assert down_to < len(self)
        # Make new pair of files #
        if dest_pair is None:
            # `__init__` only sets `self.fwd` and `self.rev`, so fall back on
            # them when no `fwd_path` / `rev_path` attribute exists.
            # NOTE(review): assumes FASTA objects offer `new_name_insert` -- confirm.
            fwd_source = getattr(self, 'fwd_path', self.fwd)
            rev_source = getattr(self, 'rev_path', self.rev)
            dest_fwd_path = fwd_source.new_name_insert("subsampled")
            dest_rev_path = rev_source.new_name_insert("subsampled")
            dest_pair = self.__class__(dest_fwd_path, dest_rev_path)
        # Do it: the loop had lost its body; every picked pair goes to the destination #
        for pair in isubsample(self, down_to):
            dest_pair.add_pair(pair)
        # Did it work #
        assert len(dest_pair) == down_to
        return dest_pair

    #------------------------------- Extensions ------------------------------#
    def parse_primers(self, *args, **kwargs):
        """Call `parse_primers` on both files and walk the results in lockstep.

        All arguments are forwarded untouched to `self.fwd.parse_primers` and
        `self.rev.parse_primers`. The two results are zipped together and
        wrapped in a `GenWithLength` carrying the length of the forward one."""
        forward = self.fwd.parse_primers(*args, **kwargs)
        reverse = self.rev.parse_primers(*args, **kwargs)
        zipped = izip(forward, reverse)
        return GenWithLength(zipped, len(forward))
Exemple #50
class UclustOTUs(OTUs):
    """Will use uclust via the QIIME wrapper to create OTU clusters from a given FASTA file

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """

    def __repr__(self):
        """Show the class name followed by the parent object."""
        class_name = self.__class__.__name__
        return '<%s object of %s>' % (class_name, self.parent)

    def __init__(self, cluster):
        """Attach to a cluster and declare the files and taxonomy objects.

        Parameters:
            cluster: the owning object, stored both as `cluster` and as
                     `parent`; it must provide `samples`, `reads` and
                     `p.otus_dir`.
        """
        # Save parent #
        self.cluster = self.parent = cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files: the unfiltered ones come from the auto paths, the others sit in base_dir #
        self.all_otus = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus = FilePath(self.base_dir + "otus.txt")
        self.centers = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        """Run the two external scripts 'pick_otus.py' and 'pick_rep_set.py'
        on `self.reads` and move the produced files into place.

        NOTE(review): the two inner generators below are only defined and
        never consumed in this block, and nothing follows the "Clean"
        comment; the corresponding statements appear to be missing.
        """
        # Clean #
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        # The three outputs are expected under the reads' prefix inside clusters_dir #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt', self.all_otus)
        shutil.move(base_name + '_otus.log', self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are only one read #
        # Keeps the lines with more than two fields and re-joins them with tabs #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        # The first field of every line of `self.otus` is an OTU to keep #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        # Keeps the sequences whose id is one of the OTUs to keep #
        def filter_otus(f):
            for seq in f:
                if seq.id in self.otus_to_keep: yield seq

    def cluster_counts_table(self):
        """Create the unfiltered OTU table

        Every line of `self.otus` holds an OTU name followed by the names of
        its reads. A read named "run<R>_pool<P>_sample<S>_read<N>" is looked
        up in `illumitag.runs`; any other read must be named
        "run<R>_sample<S>_read<N>" and is looked up among the presamples and
        pyrosamples. Blank lines are ignored.

        Returns:
            A pandas DataFrame of integer counts, one column per OTU (sorted
            with `natural_sort`) and one row per sample short name.

        Raises:
            IndexError: if a read name matches neither pattern or no sample
                        corresponds to it.
        """
        # Compile both patterns once instead of once per read #
        pooled = re.compile(r"run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)")
        unpooled = re.compile(r"run([0-9]+)_sample([0-9]+)_read([0-9]+)")
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            if not contents: continue
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = pooled.findall(r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                else:
                    # The fallback must only run when the pooled pattern
                    # failed: it never matches a pooled name.
                    nums = unpooled.findall(r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples+illumitag.pyrosamples if s.run_num==run_num and s.num==sample_num][0]
                name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        # `reindex(columns=...)` works on old and new pandas alike, unlike `reindex_axis` #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result
Exemple #51

# Built-in modules #
import inspect, os

# Internal modules #
from seqsearch.databases.ncbi_16s import ncbi_16s
from seqsearch.search.blast import BLASTquery

# First party modules #
from fasta import FASTA

# Get current directory #
# The file name of the current frame is this very script #
file_name = inspect.getframeinfo(inspect.currentframe()).filename
# Absolute directory of the script, with a trailing slash #
this_dir = os.path.dirname(os.path.abspath(file_name)) + '/'

if __name__ == "__main__":

    # Main input: the 'seqs.fasta' file sitting next to this script #
    seqs = FASTA(this_dir + 'seqs.fasta')

    # The database to search against #
    db = ncbi_16s.blast_db

    # Create search #
    query = BLASTquery(seqs, db)

    # Run #
Exemple #52
class UparseOTUs(OTUs):
    """Will use uparse to create OTU clusters from a given FASTA file

    short_name = 'uparse'
    title = 'UPARSE denovo picking'
    article = "http://www.nature.com/doifinder/10.1038/nmeth.2604"
    version = uparse_version
    threshold = 3.0

    all_paths = """

    def __repr__(self):
        """Show the class name followed by the parent object."""
        class_name = self.__class__.__name__
        return '<%s object of %s>' % (class_name, self.parent)

    def __len__(self):
        """The length of this object is always 0."""
        return 0

    def __init__(self, cluster):
        """Attach to a cluster and declare the files, the taxonomy objects and
        the source tracking object.

        Parameters:
            cluster: the owning object, stored both as `cluster` and as
                     `parent`; it must provide `samples`, `reads` and
                     `p.otus_dir`.
        """
        # Save parent #
        self.cluster = self.parent = cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.derep = SizesFASTA(self.p.derep)
        self.sorted = SizesFASTA(self.p.sorted)
        self.centers = FASTA(self.p.centers)
        self.readmap = UClusterFile(self.p.readmap)
        # Taxonomy: all of them work on the same centers file #
        centers = self.centers
        self.taxonomy_silva = CrestTaxonomy(centers, self, 'silvamod', self.p.silva_dir)
        self.taxonomy_fw = CrestTaxonomy(centers, self, 'freshwater', self.p.fw_dir)
        self.taxonomy_unite = CrestTaxonomy(centers, self, 'unite', self.p.unite_dir)
        self.taxonomy_rdp = RdpTaxonomy(centers, self)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva
        # Source tracking #
        self.seqenv = Seqenv(self)

    def run(self, threshold=None):
        """Dereplicate the reads, sort them by size, compute the OTU centers
        and map the reads back onto them.

        Parameters:
            threshold: OTU radius in percent, given to '-otu_radius_pct'.
                       Defaults to `self.threshold`. The identity used for
                       mapping the reads back is (100 - threshold) / 100.

        NOTE(review): nothing follows the "Rename the centers" comment in this
        block; the corresponding statement appears to be missing.
        """
        # Optional threshold #
        if threshold is None: threshold = self.threshold
        # Force float arithmetic: under Python 2 an integer threshold would
        # otherwise floor the identity down to 0.
        identity = (100 - threshold) / 100.0
        # Dereplicate. The usearch7 "--derep_fulllength" command is not used
        # because the 32bit version of uparse runs out of memory #
        sh.fasta_make_unique(self.reads, self.derep)
        # Order by size and kill singletons #
        sh.usearch7("--sortbysize", self.derep, '-output', self.sorted, '-minsize', 2)
        # Compute the centers #
        sh.usearch7("--cluster_otus", self.sorted, '-otus', self.centers, '-otu_radius_pct', threshold)
        # Rename the centers #
        # Map the reads back to the centers #
        sh.usearch7("-usearch_global", self.reads, '-db', self.centers, '-strand', 'plus', '-id', identity, '-uc', self.readmap)

    def checks(self):
        """Assert that `self.derep`, then `self.readmap`, each have the same
        length as `self.reads`.

        Raises AssertionError at the first mismatch."""
        for attribute in ('derep', 'readmap'):
            assert len(self.reads) == len(getattr(self, attribute))

    def cluster_counts_table(self):
        """Parse that custom output for creating the unfiltered OTU table

        Returns:
            A pandas DataFrame built from `self.readmap.otu_sample_counts`,
            with missing values replaced by 0, cast to integers, and its
            columns sorted with `natural_sort`.
        """
        result = pandas.DataFrame(self.readmap.otu_sample_counts)
        result = result.fillna(0)
        result = result.astype(int)
        # `reindex(columns=...)` works on old and new pandas alike, unlike `reindex_axis` #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result
Exemple #53
"""We explore the client given inputs, check for problems,
then format them and store them in the repository as immutable artifacts
(compressed text files)"""

import inspect, os, glob, pandas
from fasta import FASTA

# Absolute directory of this script, with a trailing slash #
current_script = inspect.getframeinfo(inspect.currentframe()).filename
current_dir = os.path.dirname(os.path.abspath(current_script)) + '/'
genomes_dir = current_dir + '../ld12/data/genomes/'

# Every '.faa' and '.fna' file of the input directory, in sorted order,
# leaving out the paths that contain '647533246' #
input_dir = "/proj/b2013274/mcl/"
faa_paths = sorted(glob.glob(input_dir + '*.faa'))
fna_paths = sorted(glob.glob(input_dir + '*.fna'))
faas = [FASTA(faa) for faa in faa_paths if '647533246' not in faa]
fnas = [FASTA(fna) for fna in fna_paths if '647533246' not in fna]

# The `short_prefix` of every file is expected to be an integer #
faas_nums = [int(g.short_prefix) for g in faas]
fnas_nums = [int(g.short_prefix) for g in fnas]
metadata = pandas.io.parsers.read_csv(current_dir +
meta_nums = list(metadata.index)

print set(faas_nums) ^ set(fnas_nums)
print set(faas_nums) ^ set(meta_nums)

def strip(seq):
Exemple #54
class CdhitOTUs(OTUs):
    """Will use cd-hit to create OTU clusters from a given FASTQ file

    short_name = "cdhit"
    title = "CD-HIT Illumina OTU picking"

    all_paths = """

    def __repr__(self):
        """Show the class name followed by the parent object."""
        class_name = self.__class__.__name__
        return "<%s object of %s>" % (class_name, self.parent)

    def __init__(self, cluster):
        """Attach to a cluster and declare the files and taxonomy objects.

        Parameters:
            cluster: the owning object, stored both as `cluster` and as
                     `parent`; it must provide `samples` and `p.otus_dir`.
        """
        # Save parent #
        self.cluster = self.parent = cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + "/"
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main reads file here FASTQ #
        self.reads = FASTQ(self.p.all_reads)
        # Files #
        self.cdhit_clusters = FilePath(self.p.clstr)
        self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
        self.centers = FASTA(self.p.centers)
        # Taxonomy: both work on the same centers file #
        centers = self.centers
        self.taxonomy_silva = CrestTaxonomy(centers, self, "silvamod", self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(centers, self, "freshwater", self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        """Concatenate the `renamed` file of every sample of the cluster into
        `self.reads`, run the cd-hit script on it and rename the centers.

        NOTE(review): nothing follows the "Clean" comment in this block; the
        corresponding statement appears to be missing.
        """
        # Combine reads but in fastq format this time #
        paths = [sample.renamed for sample in self.cluster]
        # The command line is assembled as text: paths with spaces would break it #
        shell_output("cat %s > %s" % (" ".join(paths), self.reads))
        # Clean #
        # Run command #
        cdhit = sh.Command(cdhit_script)
        # The '-p' option receives a temporary file containing "[ACTG]" #
        cdhit("-i", self.reads, "-o", self.p.clusters_dir, "-p", TmpFile.from_string("[ACTG]"))
        # Create the centers file with good names #
        self.cdhit_centers.rename_with_num("OTU-", self.centers)

    def cluster_counts_table(self):
        """Create the unfiltered OTU table

        Lines of `self.cdhit_clusters` starting with ">" open a new OTU named
        "OTU-<second field>". Every other line must contain a read named
        ">run<R>_pool<P>_sample<S>_read<N>..." (looked up in
        `illumitag.runs`) or ">run<R>_sample<S>_read<N>..." (looked up among
        the presamples and pyrosamples) and is counted for the current OTU.

        Returns:
            A pandas DataFrame of integer counts, one column per OTU (sorted
            with `natural_sort`) and one row per sample short name.

        Raises:
            IndexError: if a line matches neither pattern or no sample
                        corresponds to it.

        NOTE(review): nothing follows the "Remove OTUs that are only one
        read" comment; the corresponding statement appears to be missing.
        """
        # Compile both patterns once instead of once per line #
        pooled = re.compile(r">run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.")
        unpooled = re.compile(r">run([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.")
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.cdhit_clusters:
            if line.startswith(">"):
                otu = "OTU-%s" % line.split()[1]
                # A header line holds no read: nothing to count on it #
                continue
            nums = pooled.findall(line)
            if nums:
                run_num, pool_num, sample_num, read_num = map(int, nums[0])
                sample = illumitag.runs[run_num][pool_num - 1][sample_num - 1]
            else:
                # The fallback must only run when the pooled pattern failed #
                nums = unpooled.findall(line)
                run_num, sample_num, read_num = map(int, nums[0])
                sample = [
                    s
                    for s in illumitag.presamples + illumitag.pyrosamples
                    if s.run_num == run_num and s.num == sample_num
                ][0]
            name = sample.short_name
            # Count #
            result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        # `reindex(columns=...)` works on old and new pandas alike, unlike `reindex_axis` #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        # Remove OTUs that are only one read #
        return result
Exemple #55
 def __init__(self, fasta_path, seq_type='nucl'):
     """Record the sequence type, then run the FASTA constructor.

     Parameters:
         fasta_path: what is handed to `FASTA.__init__`. If it has a
                     `seq_type` attribute, that value takes precedence.
         seq_type: either 'nucl' or 'prot'. The default is 'nucl'; the former
                   default expression `'nucl' or 'prot'` always evaluated to
                   exactly that value.
     """
     # Check if the FASTA already has the seq_type set #
     if hasattr(fasta_path, 'seq_type'): self.seq_type = fasta_path.seq_type
     else: self.seq_type = seq_type
     # Call parent constructor #
     FASTA.__init__(self, fasta_path)
Exemple #56
    seq = seq.split('[')[0]
    return seq

# For every genome, compare the stripped names found in its .faa and .fna files
# and print how many names appear in only one of the two #
for faa,fna in zip(faas, fnas):
    faas_genes = [strip(seq) for seq in faa]
    fnas_genes = [strip(seq) for seq in fna]
    print faa, len(set(fnas_genes) ^ set(faas_genes)), "discrepancies"
    #print "- in fna but not in faa:", [x for x in set(fnas_genes) - set(faas_genes)]
    #print "- in faa but not in fna:", [x for x in set(faas_genes) - set(fnas_genes)]
    #print ""

# Across all .fna files: total number of names against number of distinct names #
fnas_genes = [strip(seq) for fna in fnas for seq in fna]
print len(fnas_genes), len(set(fnas_genes))

# Write every .faa genome again under genomes_dir, each sequence under its stripped name #
for genome in faas:
    out_path = genomes_dir + genome.short_prefix + '.fasta'
    out_fasta = FASTA(out_path)
    for seq in genome: out_fasta.add_str(str(seq.seq), strip(seq))

# One "<name>\t<rest of the description>\n" line per gene of every .faa genome;
# trailing spaces and '|' characters are removed from the description #
def lines():
    for genome in faas:
        for gene in genome:
            name = strip(gene)
            yield name + '\t' + gene.description[len(name):].rstrip(' |') + '\n'

# Save all those lines in the annotations file #
annotations_path = current_dir + '../ld12/data/annotations.tsv'
with open(annotations_path, 'w') as handle: handle.writelines(lines())
Exemple #57
 def subsampled(self):
     """Return the FASTA object on `self.p.subsampled`.

     When that file does not exist yet, `self.fasta.subsample` is first asked
     to bring the reads down to 30 and to store them there."""
     target = FASTA(self.p.subsampled)
     if target.exists: return target
     self.fasta.subsample(down_to=30, new_path=target)
     return target
Exemple #58
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have and that we need to compare against the new Illumina technology."""

    all_paths = """

    kind = "pyrosample"

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        """Parse the sample's JSON description and copy its fields onto the object.

        json_path: path to the JSON file describing the sample
        out_dir: directory prefix later used by `load()` to build `base_dir`

        The object starts with `loaded` set to False.
        """
        # Locations #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parsed description, aliased locally for the lookups below #
        self.info = load_json_path(self.json_path)
        info = self.info
        # Run and project #
        self.account = "/dev/null"
        self.run_num = info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = info['project']
        self.project_long_name = info['project_name']
        # Sample #
        self.num = info['sample_num']
        self.short_name = info['sample']
        self.long_name = info['sample_name']
        numbers = (self.run_num, self.num)
        self.name = 'run%i_sample%i' % numbers
        self.group = info['group']
        self.id_name = "run%03d-sample%02d" % numbers
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = info['files']
        # The sample stands in as its own pool and parent #
        self.pool = self
        self.parent = self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Nothing has been loaded yet #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def mate(self):
        if not 'mate' in self.info: return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        """Run `sffinfo` three times on the raw SFF, then convert the result to FASTQ.

        The `-s`, `-q` and `-m` outputs are redirected to the raw FASTA, the
        raw quality file and the manifest respectively.
        """
        paths = self.p
        sff = paths.raw_sff
        # Extraction #
        shell_output('sffinfo -s %s > %s' % (sff, paths.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (sff, paths.raw_qual))
        shell_output('sffinfo -m %s > %s' % (sff, paths.manifest))
        # Conversion #
        sh.fasta_to_fastq(paths.raw_fasta, paths.raw_qual, paths.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i,value in enumerate(averaged):
                if value < threshold:
                    read = read[:i+windowsize-1]
                    if len(read) < minlength: discard = True
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        """Write the output of `clean_iterator` over the raw FASTQ into `self.reads`.

        Keyword arguments are forwarded unchanged to `clean_iterator`.
        """
        cleaned = self.clean_iterator(self.raw_fastq, **kwargs)
        self.reads.write(cleaned)

    def report_loss(self):
        print "Before cleaning: %i" % len(self.raw_fastq)
        print "After cleaning: %i" % len(self.reads)
        print "Loss: %.2f%%" % (100 * (1 - (len(self.raw_fastq)/len(self.reads))))

    def process(self):
        """Call `rename_with_num` on the cleaned reads, with `self.fasta` as the new path.

        The prefix handed over is the sample name followed by '_read'.
        """
        prefix = self.name + '_read'
        self.reads.rename_with_num(prefix, new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """Produce the cleaned FASTQ, for the special cases where it is wanted.

        The output of `clean_iterator` over the raw FASTQ is written to
        `self.fastq`, which is then passed through `rename_with_num`.
        Keyword arguments are forwarded to `clean_iterator`.
        """
        cleaned = self.clean_iterator(self.raw_fastq, **kwargs)
        self.fastq.write(cleaned)
        self.fastq.rename_with_num(self.name + '_read')
        print("make_fastq for sample %s completed" % self.id_name)
Exemple #59
 def seeds(self):
     """Return a FASTA object built from the path `self.autopaths.seed`."""
     return FASTA(self.autopaths.seed)
Exemple #60
def FASTA_alignment():
    # example for talk
    f = FASTA.retrieve('1YGV', cache_dir) + FASTA.retrieve('3HQV', cache_dir)
    sa = SequenceAligner.from_FASTA(f)