Ejemplo n.º 1
0
 def __init__(self, fwd, rev, parent=None):
     """Pair up a forward and a reverse FASTA file.

     `fwd` and `rev` are each wrapped in a `FASTA` object; `parent` is
     stored untouched.
     """
     # The two FASTA objects (forward is built first, then reverse) #
     self.fwd, self.rev = FASTA(fwd), FASTA(rev)
     # The compression flag is copied from the forward file only #
     self.gzipped = self.fwd.gzipped
     # Back-reference supplied by the caller #
     self.parent = parent
Ejemplo n.º 2
0
def generate_values(path, progress=False):
    """Yield an (id, description, sequence) tuple for each FASTA record.

    The file at `path` is parsed with `SeqIO.parse(path, 'fasta')`. When
    `progress` is true the records are shown through a `tqdm` progress bar
    whose length is taken from `len(FASTA(path))`.
    """
    records = SeqIO.parse(path, 'fasta')
    # Optionally decorate the iterator with a progress bar of known length
    if progress:
        records = tqdm(GenWithLength(records, len(FASTA(path))))
    for record in records:
        yield (record.id, record.description, str(record.seq))
Ejemplo n.º 3
0
 def __init__(
     self,
     query_path,  # The input sequences
     db_path=pfam.hmm_db,  # The database to search
     seq_type='prot' or 'nucl',  # The seq type of the query_path file
     e_value=0.001,  # The search threshold
     params=None,  # Add extra params for the command line
     out_path=None,  # Where the results will be dropped
     executable=None,  # If you want a specific binary give the path
     cpus=None):  # The number of threads to use
     """Record every setting needed for the search.

     Note that the `seq_type` default expression `'prot' or 'nucl'`
     evaluates to `'prot'`. Passing the short names `'pfam'` or
     `'tigrfam'` as `db_path` selects the matching bundled database.
     When `out_path` ends with a slash it is treated as a directory and
     the query prefix plus `.hmmout` is appended to it.
     """
     # Inputs #
     self.query = FASTA(query_path)
     self.db = FilePath(db_path)
     # Plain options #
     self.params = params or {}
     self.e_value = e_value
     self.seq_type = seq_type
     self.executable = FilePath(executable)
     # Thread count: capped at 32 when not given explicitly #
     self.cpus = min(multiprocessing.cpu_count(), 32) if cpus is None else cpus
     # Short database names override the path set above #
     if db_path == 'pfam': self.db = pfam.hmm_db
     elif db_path == 'tigrfam': self.db = tigrfam.hmm_db
     # Resolve the output location #
     if out_path is None:
         destination = self.query.prefix_path + '.hmmout'
     elif out_path.endswith('/'):
         destination = out_path + self.query.prefix + '.hmmout'
     else:
         destination = out_path
     self.out_path = FilePath(destination)
 def to_fasta(self, path, verbose=False):
     """Write every record yielded by this object to `path` as FASTA.

     With `verbose` set, iteration goes through a `tqdm` progress bar.
     Returns a `FASTA` object pointing at the file just written.
     """
     import tqdm
     with open(path, 'w') as handle:
         # With or without a progress bar #
         records = tqdm.tqdm(self) if verbose else self
         for record in records:
             SeqIO.write(record, handle, 'fasta')
     return FASTA(path)
Ejemplo n.º 5
0
 def fresh_fasta(self):
     """A file containing all the fresh water genes.

     If the file at `self.p.fresh_fasta` does not exist yet, it is built by
     running `gunzip -c` over every entry of `genomes` whose `fresh`
     attribute is true and redirecting the output into that file. The
     record count of the result is then checked against the sum of the
     lengths of the inputs.

     Returns the `FASTA` object in both cases.
     """
     fasta = FASTA(self.p.fresh_fasta)
     if not fasta.exists:
         # Call form of print: identical output on Python 2, valid on Python 3 #
         print("Building fasta file with all fresh genes...")
         fresh = [g for g in genomes.values() if g.fresh]
         # NOTE(review): joining assumes the genome objects are str-like paths #
         shell_output('gunzip -c %s > %s' % (' '.join(fresh), fasta))
         assert len(fasta) == sum(map(len, fresh))
         self.timer.print_elapsed()
     return fasta
Ejemplo n.º 6
0
 def fasta(self):
     """The fasta file holding the filtered genes of this cluster.

     Each sequence is stored under the gene's `name` attribute. The file
     is only (re)written when the `FASTA` object evaluates as false.
     """
     result = FASTA(self.p.fasta)
     # Nothing to do when the file is already populated #
     if result: return result
     result.create()
     for gene in self.filtered_genes:
         result.add_str(str(gene), name=gene.name)
     result.close()
     return result
Ejemplo n.º 7
0
 def __init__(self, version, seq_type, base_dir=None):
     """Set up names and paths for one release of the database.

     `version` is appended to the existing `short_name` and is
     interpolated into the URL and file names. `base_dir` falls back to
     `home` when not given; everything is placed under
     `<base_dir>databases/<short_name>/`.
     """
     # Identity #
     self.version = version
     self.seq_type = seq_type
     self.short_name = self.short_name + "_" + self.version
     # Storage location #
     root = home if base_dir is None else base_dir
     self.base_dir = root + 'databases/' + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Remote sub-path #
     self.url = "release_%s/Exports/"  % self.version
     # Database: the gzipped download and the same path without '.gz' #
     self.nr99_name = "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz" % self.version
     self.nr99_dest = FASTA(self.base_dir + self.nr99_name)
     self.nr99 = FASTA(self.base_dir + self.nr99_name[:-3])
     # Alignment: same scheme as the database #
     self.aligned_name = "SILVA_%s_SSURef_Nr99_tax_silva_full_align_trunc.fasta.gz" % self.version
     self.aligned_dest = FASTA(self.base_dir + self.aligned_name)
     self.aligned = FASTA(self.base_dir + self.aligned_name[:-3])
Ejemplo n.º 8
0
 def fasta(self):
     """A fasta file with every `pfam.fasta` sequence whose description
     mentions this family's name.

     The file is built only when it does not exist yet; after building,
     it is asserted to be non-empty.
     """
     proteins = FASTA(self.p.proteins)
     # Already built on a previous call #
     if proteins.exists: return proteins
     proteins.create()
     matching = (seq for seq in pfam.fasta if self.fam_name in seq.description)
     for seq in matching: proteins.add_seq(seq)
     proteins.close()
     assert proteins
     return proteins
Ejemplo n.º 9
0
    def read_file(self, fp):
        '''
        Read the first FASTA record from the content of fp,
        and set the chromosome name and sequence using set_chromosome method.

        Raises NoChromosomeFoundError when neither a name nor a sequence
        was read, and ChromosomeFASTAFromatError when only one of the two
        is present.
        '''
        if self.verbose:
            # stderr.write works on both Python 2 and 3; the old
            # `print >> stderr, ...` form raises TypeError on Python 3.
            stderr.write("reading a FASTA record to set a chromosome\n")
        fasta = FASTA(fp=fp, verbose=self.verbose)
        chr_name, chr_seq = fasta.get_record()

        if chr_name and chr_seq:
            # Drop the first character of the name
            # (presumably the leading '>' of the header -- confirm)
            chr_name = chr_name[1:]
            self.set_chromosome(chr_name, chr_seq)
        elif not chr_name and not chr_seq:
            raise NoChromosomeFoundError(fp.name, chr_name, chr_seq)
        else:
            raise ChromosomeFASTAFromatError(fp.name, chr_name, chr_seq)
Ejemplo n.º 10
0
def main():
    """Print encoded sequences or dihedral labels for the selected PDB ids.

    For every id returned by `read_datafile(args.t)` that also has an entry
    in the dihedrals table, the sequence (peptide only when `args.pep` is
    set, otherwise universal groove + peptide) is one-hot encoded. Depending
    on `args.label`, either the encoded sequence ('x') or the dihedral
    values ('y') are printed as one comma-separated line, and the id is
    appended to the file at `args.pdbids`. Any other label value prints and
    records nothing.
    """
    args = parse_args()
    dihedrals = read_dihedrals()

    fasta = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    # The next two results are not used below; the calls are kept because
    # they were part of the original run (they read args.grooves / args.rms).
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)

    # The context manager closes the file even if encoding a record raises
    with open(args.pdbids, 'w') as outputfilehandler:
        for pdbid in pdbids:
            if pdbid not in dihedrals:
                continue

            if args.pep:
                sequence = peptides[pdbid]
            else:
                sequence = universalGrooves[pdbid] + peptides[pdbid]
            finalSeqCode = oneHotEncoding(sequence)
            finalLabelCode = dihedrals[pdbid]

            if args.label == 'x':
                print(', '.join(finalSeqCode))
            elif args.label == 'y':
                print(', '.join(finalLabelCode))
            else:
                continue
            outputfilehandler.write(pdbid + '\n')
Ejemplo n.º 11
0
def read_fasta(args):
    """Collect the 9-residue sequences of the FASTA file at `args.fasta`.

    Headers are split on '|'; the first field is used as the PDB id and the
    second as the chain id. Returns a `(pep_chain, pep_seq)` tuple of dicts,
    both keyed by PDB id, holding the chain id and the sequence of every
    record whose sequence length is exactly 9.
    """
    fasta = FASTA(args.fasta)
    fasta.read()

    pep_chain, pep_seq = {}, {}
    for header in fasta.get_headers():
        fields = header.split('|')
        pdbid, chainid = fields[0], fields[1]
        seq = fasta.get_sequence(header)

        # Only nine-mers are kept
        if len(seq) != 9:
            continue
        pep_chain[pdbid] = chainid
        pep_seq[pdbid] = seq

    return (pep_chain, pep_seq)
Ejemplo n.º 12
0
 def __init__(self,
              query_path,
              db_path,
              seq_type     = 'prot' or 'nucl',     # The seq type of the query_path file
              params       = None,                 # Add extra params for the command line
              algorithm    = "blastn" or "blastp", # Will be auto-determined with seq_type
              out_path     = None,                 # Where the results will be dropped
              executable   = None,                 # If you want a specific binary give the path
              cpus         = None,                 # The number of threads to use
              num          = None,                 # When parallelized, the number of this thread
              _out         = None,                 # Store the stdout at this path
              _err         = None):                # Store the stderr at this path
     """Record every setting needed for the search.

     The default expressions `'prot' or 'nucl'` and `"blastn" or "blastp"`
     evaluate to their first operand. When `out_path` ends with a slash
     it is treated as a directory and the query prefix plus
     `self.extension` is appended. Passing `True` for `_out` or `_err`
     places the stream next to the output file with a `.stdout` /
     `.stderr` suffix.
     """
     # Query sequences and target database #
     self.query = FASTA(query_path)
     self.db = FilePath(db_path)
     # Plain options #
     self.seq_type = seq_type
     self.algorithm = algorithm
     self.num = num
     self.params = params or {}
     # Resolve the output location, then wrap it as a file path #
     if out_path is None:
         out_path = self.query.prefix_path + self.extension
     elif out_path.endswith('/'):
         out_path = out_path + self.query.prefix + self.extension
     self.out_path = FilePath(out_path)
     # Executable #
     self.executable = FilePath(executable)
     # Thread count: capped at 32 when not given explicitly #
     self.cpus = min(multiprocessing.cpu_count(), 32) if cpus is None else cpus
     # Standard output and error destinations #
     self._out = (self.out_path + '.stdout') if _out is True else _out
     self._err = (self.out_path + '.stderr') if _err is True else _err
Ejemplo n.º 13
0
def main():
    """Print encoded sequence pairs or their labels for the selected ids.

    Every key of the RMSD table has the form `<id1>_<id2>`. When either id
    is in the list read from `args.t`, the two sequences (peptides only if
    `args.pep` is set, otherwise groove + peptide) are joined with '|' and
    encoded together with the label and two aaindex tables. `args.label`
    selects what is printed: 'x' for the encoded sequence, 'y' for the
    encoded label.
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)
    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)

    # Two amino-acid index tables looked up by accession
    aaindex = Aaindex()
    index_data = aaindex.get('FASG890101').index_data
    charge_data = aaindex.get('KLEP840101').index_data

    for l in labels:
        (pdbid1, pdbid2) = l.split('_')
        if not (pdbid1 in pdbids or pdbid2 in pdbids):
            continue

        if args.pep:
            pair = peptides[pdbid1] + '|' + peptides[pdbid2]
        else:
            pair = (universalGrooves[pdbid1] + peptides[pdbid1] + '|' +
                    universalGrooves[pdbid2] + peptides[pdbid2])
        finalSeqCode, finalLabelCode = oneHotEncoding(
            pair, labels[l], index_data, charge_data)

        if args.label == 'x':
            print(', '.join(finalSeqCode))
        elif args.label == 'y':
            print(', '.join(finalLabelCode))
Ejemplo n.º 14
0
def main():
    """Randomly split the PDB ids of a FASTA file into train and test lists.

    The test list is capped at `int(args.percent * number_of_ids)` entries;
    each id goes to the test list with probability 0.5 while the cap has not
    been reached, and to the train list otherwise. The two lists are written
    to 'train/90_10/train.txt' and 'test/90_10/test.txt'.
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    fasta.read()
    peptides = totalNineMers(fasta)
    # Materialize the keys: on Python 3 `keys()` is a view that cannot be
    # indexed, which made the original `pdbids[i]` raise TypeError.
    pdbids = list(peptides.keys())
    testsetlen = int(args.percent * len(pdbids))

    trainset = []
    testset = []
    for pdbid in pdbids:
        r = random()
        # NOTE(review): because of the coin flip the test list can end up
        # shorter than `testsetlen` -- confirm this is intended.
        if len(testset) < testsetlen and r < 0.5:
            testset.append(pdbid)
        else:
            trainset.append(pdbid)

    write_to_file('train/90_10/train.txt', trainset)
    write_to_file('test/90_10/test.txt', testset)
Ejemplo n.º 15
0
def main():
    """Split the PDB ids into train and test lists, testing on one allele.

    An id is sent to the test list only while that list is below
    `int(args.percent * number_of_ids)` entries, a random draw is under 0.5
    and its allele is 'A0201'; every other id goes to the train list. The
    lists are written to 'train.txt' and 'test.txt'.
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    fasta.read()
    peptides, alleles = totalNineMers(fasta)
    pdbids = peptides.keys()
    testsetlen = int(args.percent * len(pdbids))

    trainset, testset = [], []
    for p in pdbids:
        r = random()
        goes_to_test = (len(testset) < testsetlen and r < 0.5
                        and alleles[p] == 'A0201')
        target = testset if goes_to_test else trainset
        target.append(p)

    write_to_file('train.txt', trainset)
    write_to_file('test.txt', testset)
Ejemplo n.º 16
0
 def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
     """Describe how the file at `path` is divided into numbered parts.

     Either give `num_parts` directly, or give `part_size` (a human
     readable size parsed by `humanfriendly.parse_size`), in which case
     the number of parts is derived from `self.count_bytes`. `part_size`
     wins when both are given. Parts live in `base_dir`, which defaults
     to `<path>.parts/`, as `NNN/part.fasta` with NNN starting at 001.
     """
     # Basic #
     self.path = path
     # Directory #
     self.base_dir = path + '.parts/' if base_dir is None else base_dir
     # Explicit number of parts #
     if num_parts is not None: self.num_parts = num_parts
     # Number of parts derived from a target size #
     if part_size is not None:
         self.bytes_target = humanfriendly.parse_size(part_size)
         # NOTE(review): the ceil only rounds up under true division -- confirm #
         ratio = self.count_bytes / self.bytes_target
         self.num_parts = int(math.ceil(ratio))
     # Make parts #
     self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
     part_numbers = range(1, self.num_parts + 1)
     self.parts = [FASTA(self.make_name(number)) for number in part_numbers]
     # Tag each part with its zero-based position in the list #
     for position, part in enumerate(self.parts):
         part.num = position
Ejemplo n.º 17
0
def main():
    """Print encoded sequence pairs or their labels for the selected ids.

    Every key of the RMSD table has the form `<id1>_<id2>`. When either id
    is in the list read from `args.t`, the two sequences (peptides only if
    `args.pep` is set, otherwise groove + peptide) are joined with '|' and
    encoded together with the label. `args.label` selects what is printed:
    'x' for the encoded sequence, 'y' for the encoded label.
    """
    args = parse_args()

    fasta = FASTA(args.fasta)
    fasta.read()
    (peptides, mhcSeq, mhcAllele) = totalNineMers(fasta)
    universalGrooves = universalGroove(args.grooves, mhcSeq, peptides)
    intersectGrooves = IntersectionGroove(args.grooves, mhcSeq, peptides)

    labels = read_rmsd_file(args.rms)
    pdbids = read_datafile(args.t)

    for l in labels:
        (pdbid1, pdbid2) = l.split('_')
        if not (pdbid1 in pdbids or pdbid2 in pdbids):
            continue

        if args.pep:
            pair = peptides[pdbid1] + '|' + peptides[pdbid2]
        else:
            pair = (universalGrooves[pdbid1] + peptides[pdbid1] + '|' +
                    universalGrooves[pdbid2] + peptides[pdbid2])
        finalSeqCode, finalLabelCode = oneHotEncoding(pair, labels[l])

        if args.label == 'x':
            print(', '.join(finalSeqCode))
        elif args.label == 'y':
            print(', '.join(finalLabelCode))
Ejemplo n.º 18
0
 def test(self):
     """Run a single-sequence search against `self.blast_db` as a smoke test."""
     # Scratch directory for input and output #
     directory = new_temp_dir()
     # A randomly chosen sequence (Homo sapiens mRNA for prepro cortistatin) #
     seq = """ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC
     CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC
     CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG
     AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC
     CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG
     TTTAATTACAGACCTGAA"""
     # Strip the line breaks and indentation of the literal above #
     seq = seq.replace('\n','').replace(' ','')
     # Write the query file #
     input_fasta = FASTA(directory + 'input.fasta')
     input_fasta.create()
     input_fasta.add_str(seq, "My test sequence")
     input_fasta.close()
     # Configure the search: output path and extra command line parameters #
     search = SeqSearch(input_fasta,
                        self.blast_db,
                        'nucl',
                        'blast',
                        num_threads = 1,
                        out_path    = directory + 'output.blast',
                        params      = {'-outfmt': 0,
                                       '-evalue': 1e-5,
                                       '-perc_identity': 99})
     # Run it and report where the files are #
     search.run()
     print("Success", directory)
Ejemplo n.º 19
0
 def seeds(self):
     """A `FASTA` object wrapping the path `self.autopaths.seed`."""
     return FASTA(self.autopaths.seed)
Ejemplo n.º 20
0
 def fasta(self):
     """A `FASTA` object wrapping the path `self.autopaths.fasta`."""
     return FASTA(self.autopaths.fasta)
Ejemplo n.º 21
0
 def subsampled(self):
     """The subsampled version of `self.fasta`, built if it does not exist.

     Building means subsampling down to 30 into the new path and then
     calling `add_taxonomy` on the result.
     """
     result = FASTA(self.p.subsampled)
     # Already built on a previous call #
     if result.exists: return result
     self.fasta.subsample(down_to=30, new_path=result)
     self.add_taxonomy(result)
     return result
Ejemplo n.º 22
0
    'TTDB/TriTrypDB-46_TcruziCLBrenerEsmeraldo-like_AnnotatedCDSs.fasta',
    'organism': 'TcruziCLBrenerEsmeraldo-like'
}
# Genome file, annotated-CDS (regions) file and organism label for the
# TcruziCLBrenerNon-Esmeraldo-like data set.
non_emeraldo = {
    'genome_filename':
    'TTDB/TriTrypDB-46_TcruziCLBrenerNon-Esmeraldo-like_Genome.fasta',
    'regions_filename':
    'TTDB/TriTrypDB-46_TcruziCLBrenerNon-Esmeraldo-like_AnnotatedCDSs.fasta',
    'organism': 'TcruziCLBrenerNon-Esmeraldo-like'
}

# The data set the script below operates on; change this line to switch.
organism = emeraldo_like

if __name__ == "__main__":
    # Load FASTA files
    genome = FASTA(organism['genome_filename'])
    genome.load()

    regions = FASTA(organism['regions_filename'])
    regions.load()

    # Load database file
    sqlite = sqlite3.connect(SQLite_DB)

    # Create MFASeq Folder
    Organism_MFASeq_folder = f"{MFASeq_folder}/MFA-Seq_{organism['organism']}"
    if not os.path.isdir(Organism_MFASeq_folder):
        os.mkdir(Organism_MFASeq_folder)

    # Create MFASeq Files
    for chromosome_id in genome.data.keys():
Ejemplo n.º 23
0
                    "--basepairs",
                    action='store_true',
                    help="Use base pairs instead of genome counts")

# Parse the command line and fall back to built-in defaults for anything
# not supplied. Because `or` is used, falsy values (e.g. 0 or an empty
# string) are replaced by the default as well.
args = parser.parse_args()

# NOTE(review): the fallback FASTA path is an absolute path inside one
# user's home directory -- it will not exist on other machines.
protein_fasta = args.fasta or '/home/seijihariki/Documents/TCC/TTDB/TriTrypDB-46_TcruziCLBrenerEsmeraldo-like_AnnotatedTranscripts.fasta'
simulation_folder = args.simulation
search = args.search or 'DGF-1'
base_pairs = args.basepairs

simulation_cnt = args.count or 50
chromosomes_cnt = args.chromosomes or 41

# Load the transcripts FASTA file into memory
print('Loading annotations:')
transcripts = FASTA(protein_fasta)
transcripts.load()

collisions = {}

print('Detecting collisions:')
for chromosome in range(chromosomes_cnt):
    chromosome_name = f"TcChr{chromosome + 1}-S"
    collisions[chromosome_name] = []

    for simulation in range(simulation_cnt):
        with open(
                f"{simulation_folder}simulation_{simulation}/{chromosome_name}.cseq"
        ) as times:
            start, end = -2, -2
            current_location = 0
Ejemplo n.º 24
0
"""

# Built-in modules #
import inspect, os

# Internal modules #
from seqsearch.databases.ncbi_16s import ncbi_16s
from seqsearch.search.blast import BLASTquery

# First party modules #
from fasta import FASTA

# Directory containing this script, always with a trailing slash #
file_name = inspect.getframeinfo(inspect.currentframe()).filename
this_dir = "%s/" % os.path.dirname(os.path.abspath(file_name))

###############################################################################
if __name__ == "__main__":
    # Query sequences stored next to this script, searched against the
    # BLAST database exposed by `ncbi_16s` #
    seqs  = FASTA(this_dir + 'seqs.fasta')
    db    = ncbi_16s.blast_db
    query = BLASTquery(seqs, db)

    # Launch the search #
    query.run()
Ejemplo n.º 25
0
"""We explore the client given inputs, check for problems,
then format them and store them in the repository as immutable artifacts
(compressed text files)"""

import inspect, os, glob, pandas
from fasta import FASTA

# Locations: this script's directory and the genomes directory relative to it
current_script = inspect.getframeinfo(inspect.currentframe()).filename
current_dir = os.path.dirname(os.path.abspath(current_script)) + '/'
genomes_dir = current_dir + '../ld12/data/genomes/'

# Collect the protein (.faa) and nucleotide (.fna) inputs, leaving out any
# path that contains '647533246'
input_dir = "/proj/b2013274/mcl/"
faa_paths = sorted(glob.glob(input_dir + '*.faa'))
fna_paths = sorted(glob.glob(input_dir + '*.fna'))
faas = [FASTA(faa) for faa in faa_paths if '647533246' not in faa]
fnas = [FASTA(fna) for fna in fna_paths if '647533246' not in fna]

# The numeric identifier of each file is its short prefix
faas_nums = [int(g.short_prefix) for g in faas]
fnas_nums = [int(g.short_prefix) for g in fnas]
metadata = pandas.io.parsers.read_csv(current_dir +
                                      '../ld12/data/metadata.tsv',
                                      sep='\t',
                                      index_col=0,
                                      encoding='utf-8')
meta_nums = list(metadata.index)

# Sanity checks: show the identifiers that are present on one side only.
# The call form of print gives the same output on Python 2 and is valid
# syntax on Python 3.
print(set(faas_nums) ^ set(fnas_nums))
print(set(faas_nums) ^ set(meta_nums))


def strip(seq):
Ejemplo n.º 26
0
from fasta import FASTA, AlignedFASTA
# Wrap two files from the working directory: a plain FASTA file and an
# aligned FASTA file.
community = FASTA('community.fasta')
alignment = AlignedFASTA('alignment.fasta')
Ejemplo n.º 27
0
 def all_proteins(self):
     """The main fasta file, wrapping the path `self.p.unzipped_proteins`."""
     proteins = FASTA(self.p.unzipped_proteins)
     return proteins
Ejemplo n.º 28
0
 def to_fasta(self, path):
     """Write every record yielded by this object to `path` as FASTA.

     Returns a `FASTA` object pointing at the file just written.
     """
     with open(path, 'w') as handle:
         for record in self: SeqIO.write(record, handle, 'fasta')
     return FASTA(path)