Example #1
    def scan_sequences(self, seq_file, threshold=0., only_best=False):
        """
        Apply the TFFM on the fasta sequences and return the TFBS hits.

        :arg seq_file: Fasta file giving the DNA sequences to apply the TFFM
            on.
        :type seq_file: str
        :arg threshold: The threshold used to predict a hit (i.e. the minimal
            probability value for a position to be considered a TFBS hit)
            (default: 0.0).
        :type threshold: float
        :arg only_best: Argument to be set to :class:`True` if only the best
            TFBS hit per sequence is to be reported (default: :class:`False`)
        :type only_best: bool
        :returns: TFBS hits through a generator.
        :rtype: :class:`Generator` of :class:`HIT`

        :note: (**0.0 <=** *threshold* **<= 1.0**)

        """

        sequence_list = utils.parse_fasta(seq_file)
        for seq_record in sequence_list:
            hits = self.scan_sequence(seq_record, threshold, only_best)
            for hit in hits:
                yield hit
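
A minimal usage sketch (`tffm` is assumed to be an already-constructed TFFM
instance; the file name and threshold are illustrative):

# Usage sketch, not part of the class above.
for hit in tffm.scan_sequences("sequences.fa", threshold=0.5, only_best=True):
    if hit:  # a sequence may yield a falsy value when nothing clears the threshold
        print(hit)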
Example #2
    def pocc_sequences(self, seq_file, threshold=0.):
        """
        Apply the TFFM on the fasta sequences and return the Pocc value
        (probability of occupancy) for each sequence.

        :arg seq_file: Fasta file giving the DNA sequences to apply the TFFM
            on.
        :type seq_file: str
        :arg threshold: The threshold used to predict hits that will be used to
            compute the Pocc (default: 0.0).
        :type threshold: float

        :returns: Pocc values through a generator (one :class:`HIT` per
            sequence, carrying the Pocc as its score).
        :rtype: :class:`Generator` of :class:`HIT`

        :note: (**0.0 <=** *threshold* **<= 1.0**)

        """

        sequence_list = utils.parse_fasta(seq_file)
        for seq_record in sequence_list:
            pocc = 1.
            for hit in self.scan_sequence(seq_record, threshold, False):
                if hit:
                    pocc *= (1. - hit.score)
            # pocc currently holds the probability of *no* hit anywhere on
            # the sequence; the probability of occupancy is its complement.
            yield hit_module.HIT(seq_record, 1, len(seq_record), None,
                                 1. - pocc, self, None)
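
The combination rule used above is Pocc = 1 - prod_i(1 - p_i), the
probability that at least one position is occupied. A standalone sketch of
that rule (illustrative values, independent of the class):

# Self-contained sketch of the Pocc combination rule.
def pocc_from_scores(hit_scores):
    p_no_hit = 1.0
    for p in hit_scores:
        p_no_hit *= 1.0 - p  # probability this position is NOT bound
    return 1.0 - p_no_hit    # probability at least one position is bound

assert abs(pocc_from_scores([0.5, 0.5]) - 0.75) < 1e-9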
Example #3
def retrieve_taxonomy(prefix):
  # Relies on the module-level `sequenceFile`; the `prefix` argument is
  # unused in this excerpt.
  Entrez.email = "*****@*****.**"
  genbankACCRegex = re.compile(r'([A-Z]{2}[0-9]{8}|[A-Z]{2}[0-9]{6}|[A-Z][0-9]{5}|NC_[0-9]{6})')
  accessionIDs = [header for header, sequence in utils.parse_fasta(sequenceFile)]

  validIDs = list(filter(genbankACCRegex.match, accessionIDs))
  invalidIDs = [x for x in accessionIDs if x not in validIDs]

  # Retry the elink query until it succeeds; NCBI intermittently rejects requests.
  while True:
    try:
      handle = Entrez.elink(dbfrom='nuccore', db='taxonomy', id=validIDs, idtype='acc', rettype='xml')
      record = Entrez.read(handle)
      handle.close()
    except RuntimeError:
      time.sleep(5)
      continue
    break

  # Map accession (version suffix stripped) to taxonomy ID; "XXXXX" marks
  # accessions without a taxonomy link.
  taxIDs = {
    str(x['IdList'][0]).split('.')[0]: x['LinkSetDb'][0]['Link'][0]['Id'] if x['LinkSetDb'] else "XXXXX"
    for x in record
  }
  values = list(set(taxIDs.values()))

  target_handle = Entrez.efetch(db='taxonomy', id=','.join(values), retmode='xml')
  target_record = Entrez.read(target_handle)
  target_handle.close()

  accID2species = get_clade_by_rank('species', taxIDs, target_record)
  accID2genus = get_clade_by_rank('genus', taxIDs, target_record)

  accID2GBdescription = {}
  nuccore_handle = Entrez.efetch(db='nuccore', id=','.join(list(accID2species.keys())), rettype='gb', retmode='xml')
  nuccore_records = Entrez.read(nuccore_handle)

  for record in nuccore_records:
    country = collectionDate = '--'
    accID = record['GBSeq_primary-accession'].split('.')[0]
    featureTable = record['GBSeq_feature-table'][0]['GBFeature_quals']
    for feature in featureTable:
      if feature['GBQualifier_name'] == 'country':
        country = feature['GBQualifier_value']
      if feature['GBQualifier_name'] == 'collection_date':
        collectionDate = feature['GBQualifier_value']
    accID2GBdescription[accID] = f"{accID2species[accID]},{accID2genus[accID]},{record['GBSeq_definition'].replace(' ','_')},{country.replace(' ','_')},{collectionDate.replace(' ','_')}"

  # efetch can return fewer records than requested; re-fetch the remainder.
  while len(accID2GBdescription) != len(accID2species):
    remainder = [acc for acc in accID2species if acc not in accID2GBdescription]
    time.sleep(1)
    nuccore_handle = Entrez.efetch(db='nuccore', id=','.join(remainder), rettype='gb', retmode='xml')
    nuccore_records = Entrez.read(nuccore_handle)
    for record in nuccore_records:
      country = collectionDate = '--'
      accID = record['GBSeq_primary-accession'].split('.')[0]
      featureTable = record['GBSeq_feature-table'][0]['GBFeature_quals']
      for feature in featureTable:
        if feature['GBQualifier_name'] == 'country':
          country = feature['GBQualifier_value']
        if feature['GBQualifier_name'] == 'collection_date':
          collectionDate = feature['GBQualifier_value']
      accID2GBdescription[accID] = f"{accID2species[accID]},{accID2genus[accID]},{record['GBSeq_definition'].replace(' ','_')},{country.replace(' ','_')},{collectionDate.replace(' ','_')}"
  return accID2GBdescription
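
The GenBank-record parsing block appears twice above; a minimal refactoring
sketch (the helper name is an assumption, not part of the original script):

def describe_records(nuccore_records, accID2species, accID2genus, accID2GBdescription):
  # Shared GenBank parsing used by both fetch passes of retrieve_taxonomy.
  for record in nuccore_records:
    country = collectionDate = '--'
    accID = record['GBSeq_primary-accession'].split('.')[0]
    for feature in record['GBSeq_feature-table'][0]['GBFeature_quals']:
      if feature['GBQualifier_name'] == 'country':
        country = feature['GBQualifier_value']
      if feature['GBQualifier_name'] == 'collection_date':
        collectionDate = feature['GBQualifier_value']
    accID2GBdescription[accID] = (
      f"{accID2species[accID]},{accID2genus[accID]},"
      f"{record['GBSeq_definition'].replace(' ', '_')},"
      f"{country.replace(' ', '_')},{collectionDate.replace(' ', '_')}"
    )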
Example #4
def split_assembled_genome(gtf_path, fasta_path, od='.', L=98):
    trs = parse_gtf(gtf_path)
    print('GTF parsed')
    scaffolds = parse_fasta(fasta_path)
    print('Scaffolds parsed')

    wrong_scaffolds = 0
    for tr, data in trs.items():
        processed = []
        unprocessed = []
        if data['scaffold'] in scaffolds:
            sequence = scaffolds[data['scaffold']]
        else:
            print(f'{data["scaffold"]} not in FASTA file {fasta_path}')
            wrong_scaffolds += 1
            continue
        for exon in data['exons']:
            processed.append((int(exon['start']) - 1, int(exon['end']) - 1))
            unprocessed.append(
                (int(exon['start']) - L, int(exon['end']) + L - 2))

        processed = list(merge_intervals(processed))
        unprocessed = merge_intervals(unprocessed)

        processed = ''.join(map(lambda iv: sequence[iv[0]:iv[1]], processed))
        splice_junctions = []
        for iv in unprocessed:
            if iv[1] - iv[0] < 3 * L - 3:
                # The underlying exon is shorter than L - 1 nt, so the two
                # junction windows would overlap; keep the interval whole.
                splice_junctions.append(iv)
            else:
                splice_junctions.append((iv[0], iv[0] + 2 * L - 2))
                splice_junctions.append((iv[1] - (2 * L) + 2, iv[1]))

        splice_junctions = [
            collapse_N(sequence[iv[0]:iv[1]].upper())
            for iv in splice_junctions
        ]

        processed = collapse_N(processed).upper()
        if data['strand'] == '-':
            processed = reverse_complement(processed)
            splice_junctions = map(reverse_complement, splice_junctions)

        with open(f'{od}/processed_transcripts.fasta', 'a') as fh:
            fh.write(f'>{tr}\n')
            fh.write('\n'.join(
                [processed[i:i + 80] for i in range(0, len(processed), 80)]))
            fh.write('\n')

        with open(f'{od}/splice_junctions.fasta', 'a') as fh:
            for i, sj in enumerate(splice_junctions):
                fh.write(f'>{tr}:{i}\n')
                fh.write(f'{sj}\n')
    print('DONE!')
    print(
        f'{wrong_scaffolds} scaffolds were not found, and the corresponding annotations were ignored.'
    )
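
merge_intervals is imported from elsewhere and not shown; a minimal sketch of
the standard sweep it presumably implements (an assumption, not the project's
actual code):

def merge_intervals(intervals):
    # Merge overlapping (start, end) pairs after sorting by start.
    intervals = sorted(intervals)
    if not intervals:
        return
    start, end = intervals[0]
    for s, e in intervals[1:]:
        if s <= end:
            end = max(end, e)  # overlaps the current interval: extend it
        else:
            yield (start, end)
            start, end = s, e
    yield (start, end)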
Example #5
def calculate_gc_percentage(data):
    gene_dict = parse_fasta(data)

    gc_dict = {}
    for k, v in gene_dict.items():
        gc_dict[k] = gc_percentage(v)

    max_key = max(gc_dict, key=lambda k: gc_dict[k])

    return max_key + '\n' + str(gc_dict[max_key])
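
gc_percentage is not shown here; a plausible definition (an assumption) that
matches the usage above:

def gc_percentage(sequence):
    # Percentage of G and C bases over the sequence length.
    return (sequence.count('G') + sequence.count('C')) / len(sequence) * 100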
Example #6
def overlap_graph(data):
    gene_dict = parse_fasta(data)

    tup_list = []
    for k, v in gene_dict.items():
        for kj, kv in gene_dict.items():
            if v[-3:] == kv[:3] and k != kj:
                tup_list.append((k, kj))

    result = ''
    for x in tup_list:
        result += '{0} {1}\n'.format(x[0], x[1])

    return result
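
The nested loop above compares every ordered pair of sequences, which is
O(n^2); for large inputs, indexing sequences by their 3-nt prefix avoids most
comparisons. A sketch (the function name is an assumption):

from collections import defaultdict

def overlap_graph_indexed(gene_dict, k=3):
    # Map each k-prefix to the names of sequences starting with it, then
    # look up each sequence's k-suffix instead of scanning all pairs.
    by_prefix = defaultdict(list)
    for name, seq in gene_dict.items():
        by_prefix[seq[:k]].append(name)
    edges = []
    for name, seq in gene_dict.items():
        for other in by_prefix.get(seq[-k:], []):
            if other != name:
                edges.append((name, other))
    return edges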
Example #7
def reseed_chrom(organism, accession):
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=" + accession + "&rettype=fasta&retmode=text"
    resource = urllib.request.urlopen(url)
    content = resource.read().decode("utf-8")

    fasta = parse_fasta(content)

    spec = { 'scientific name' : organism}
    taxon = tax_collect.find_one(spec)
    taxon_id = ""
    if taxon :
        taxon_id = taxon['id']

    builder = SequencePageBuilder(seq_collect, organism, taxon_id, "", 10000)
    builder.purge(accession)
    builder.process(accession, fasta['body'], False)
Example #8
def get_fasta_seqs(ids):
    """Get fasta sequences from protein IDs on uniprot.

    Args:
        ids: list of protein IDs

    Returns:
        a dict of fasta sequences, with key: protein id and value: sequence
    """

    url_template = "http://www.uniprot.org/uniprot/{}.fasta"
    fasta_seqs = {}

    for protein_id in ids:  # renamed to avoid shadowing the built-in id()
        r = requests.get(url_template.format(protein_id))
        data = r.text.split("\n")[:-1]
        fasta_sequence = parse_fasta(data)
        fasta_seqs[protein_id] = next(iter(fasta_sequence.values()))

    return fasta_seqs
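
A quick usage sketch (the accessions are illustrative; the legacy
www.uniprot.org URL may redirect to the current rest.uniprot.org API):

# Illustrative call with two example UniProt accessions.
seqs = get_fasta_seqs(["P69905", "P68871"])
for acc, seq in seqs.items():
    print(acc, len(seq))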
Example #9
def consensus_and_profile(data):
    gene_dict = parse_fasta(data)

    # All sequences are assumed to have the same length; take it from the
    # first record.
    length = len(next(iter(gene_dict.values())))

    profile = {
        'A': [0] * length,
        'T': [0] * length,
        'C': [0] * length,
        'G': [0] * length
    }

    for v in gene_dict.values():
        for i, base in enumerate(v):
            if base in profile:
                profile[base][i] += 1

    profile_str = 'A: %s\nC: %s\nG: %s\nT: %s\n' % \
                  (' '.join(str(x) for x in profile['A']),
                   ' '.join(str(x) for x in profile['C']),
                   ' '.join(str(x) for x in profile['G']),
                   ' '.join(str(x) for x in profile['T']))

    consensus = ['A'] * length
    for i in range(length):
        consensus[i] = max(profile, key=lambda k: profile[k][i])

    return ''.join(consensus) + '\n' + profile_str
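
A tiny worked check (assuming parse_fasta accepts a list of raw FASTA lines
and returns a {header: sequence} dict, as in the other examples):

demo = ['>s1', 'ATG', '>s2', 'ATA', '>s3', 'ACG']
print(consensus_and_profile(demo))
# Expected: consensus "ATG" with counts A: 3 0 1, C: 0 1 0, G: 0 0 2, T: 0 2 0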
Example #10
                    outputStream.write(f"{acc},--,--,--,--,--\n")
            outputStream.write(f"####################\n\n")
        return (avgClusterPerSpecies, avgClusterPerGenus)


#########################################################################

args = docopt(__doc__)

treeFile = args['<treeFile>']
seqFile = args['<seqFile>']
clusterFile = args['<clusterFile>']
NCBI = bool(args['--ncbi'])
PREFIX = args['--toolName']

allSequences = {header: seq for header, seq in utils.parse_fasta(seqFile)}
cluster, centroids, failbob = utils.parse_clusterFile(clusterFile)

if not failbob:
    failbob = [
        clstr for idx, clstr in cluster.items() if len(clstr) == 1
    ]
#realCluster = {idx : cluster for idx,cluster in cluster.items() if len(cluster) != 1 and idx != '-1'}
realCluster = {
    idx: clstr
    for idx, clstr in cluster.items() if clstr not in failbob
}

if treeFile:
    tree = dendropy.Tree.get(path=treeFile, schema='newick')
    dm = tree.phylogenetic_distance_matrix()
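
A sketch of how the distance matrix might then be queried (an illustration;
dendropy's PhylogeneticDistanceMatrix exposes pairwise patristic distances):

# Sketch: distance between the first two taxa, only relevant when a
# treeFile was supplied.
taxa = list(tree.taxon_namespace)
if len(taxa) >= 2:
    print(dm.patristic_distance(taxa[0], taxa[1]))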
Example #11
# Author: Kevin Lamkiewicz
# Email: [email protected]
"""
"""

import sys
from collections import defaultdict

import utils

inputFile = sys.argv[1]
fastaFile = sys.argv[2]

if len(sys.argv) == 4:
    goiFile = sys.argv[3]
    goiHeader = [header for header, _ in utils.parse_fasta(goiFile)]

outputFile = f"{sys.argv[1]}.clstr"

clusterInfo = defaultdict(list)
fastaContent = {header: seq for header, seq in utils.parse_fasta(fastaFile)}

with open(inputFile, 'r') as inputStream:
    centroids = set()
    for line in inputStream:
        currentArray = line.split()
        centroid = currentArray[0]
        sequence = currentArray[1]
        centroids.add(centroid)
        clusterInfo[centroid].append(sequence)
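
A possible follow-up that writes the collected clusters to the output file
(the exact .clstr layout is an assumption):

with open(outputFile, 'w') as outputStream:
    for idx, (centroid, members) in enumerate(clusterInfo.items()):
        outputStream.write(f">Cluster {idx} (centroid: {centroid})\n")
        for member in members:
            outputStream.write(f"{member}\n")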
Example #12
codon2aminoacid = {
    # (codon table truncated in this excerpt)
    'TCT': 'S',
    'TTC': 'F',
    'TTT': 'F',
    'TTA': 'L',
    'TTG': 'L',
    'TAC': 'Y',
    'TAT': 'Y',
    'TAA': '*',
    'TAG': '*',
    'TGC': 'C',
    'TGT': 'C',
    'TGA': '*',
    'TGG': 'W',
}

for header, sequence in utils.parse_fasta(sequenceFile):
    positiveStrand = ""
    longestCDS = 0
    strands = [sequence, utils.reverseComplement(sequence)]
    for strand in strands:
        for frame in range(3):
            proteinSequence = ""
            for fragment in range(frame, len(strand), 3):
                codon = strand[fragment:fragment + 3]
                if len(codon) != 3:
                    continue
                try:
                    proteinSequence += codon2aminoacid[codon]
                except KeyError:
                    proteinSequence += 'X'
            matches = regex_orf.findall(proteinSequence)
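
regex_orf is defined outside this excerpt; a plausible pattern under the
usual start-codon-to-stop convention (an assumption):

import re

# A methionine followed by anything up to and including the next stop ('*').
regex_orf = re.compile(r'M[^*]*\*')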
Example #13
def main():
    with open(sys.argv[1]) as f:
        fastas = utils.parse_fasta(f)
        m = max_gc_content(fastas)
        print("{}\n{}%".format(m[0], m[1] * 100))
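
max_gc_content is not shown; a minimal sketch consistent with the test in
Example #20 below (header and GC fraction of the highest-GC record):

def max_gc_content(fastas):
    # fastas: iterable of (header, sequence) pairs.
    def gc(seq):
        return (seq.count('G') + seq.count('C')) / len(seq)
    return max(((h, gc(s)) for h, s in fastas), key=lambda t: t[1])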
Example #14
# Author: Kevin Lamkiewicz
# Email: [email protected]
"""
This script mainly exists, because the output of cd-hit-est sucks :)
"""

import sys

import utils

inputFile = sys.argv[1]
sequenceFile = sys.argv[2]

if len(sys.argv) == 4:
    goiFile = sys.argv[3]
    goiHeader = [header for header, _ in utils.parse_fasta(goiFile)]

originalHeader = sorted(
    [header for header, _ in utils.parse_fasta(sequenceFile)])
truncatedHeader = []
with open(inputFile, 'r') as inputStream:
    for line in inputStream:
        if line.startswith('>'):
            continue
        truncatedHeader.append(line.strip().split('>')[1].split(' ')[0])

truncatedHeader = sorted(truncatedHeader)
assert (len(truncatedHeader) == len(originalHeader))
headerMapping = {x: y for x, y in zip(truncatedHeader, originalHeader)}
assert (len(headerMapping) == len(originalHeader))
Example #15
                       type=str,
                       help='File containing the PSSM profile.')
    group.add_argument('--fasta',
                       type=str,
                       help='File containing the FASTA sequence.')
    parser.add_argument('filename_model',
                        type=str,
                        help='File containing the GOR model.')
    args = parser.parse_args()

    # allow_pickle is required for object arrays in recent NumPy versions.
    model = np.load(args.filename_model, allow_pickle=True)['model'].item()

    if args.pssm:
        profile = parse_pssm(args.pssm)
    else:
        profile = seq_to_profile(parse_fasta(args.fasta))

    dssp = ''

    for i in range(0, len(profile)):
        half_window_size = int((len(model['-']) - 1) / 2)
        score_H, score_E, score_C = 0, 0, 0
        for j in range(max(0, i - half_window_size),
                       min(i + half_window_size + 1, len(profile))):
            for k in range(0, len(profile[i])):
                score_H += profile[j][k] * model['H'][j - i +
                                                      half_window_size][k]
                score_E += profile[j][k] * model['E'][j - i +
                                                      half_window_size][k]
                score_C += profile[j][k] * model['-'][j - i +
                                                      half_window_size][k]
Example #16
    def test_definition_case(self):
        fasta = list(utils.parse_fasta(self.SAMPLE_DATASET))[0]
        self.assertAlmostEqual(gc_content(fasta), .60919540)
Example #17
def compute_GC(sequences):
    """
    Args:
        sequences: dict of sequences

    Returns:
        a dict with key: sequence name and value: GC content
    """
    res = {}

    for name in sequences:
        sequence = sequences[name]
        gc_content = (sequence.count("G") +
                      sequence.count("C")) / len(sequence) * 100
        res[name] = round(gc_content, 4)

    return res


if __name__ == "__main__":
    path = "../data/GC.txt"

    with open(path) as f:
        data = [s.strip() for s in f.readlines()]

    sequences = parse_fasta(data)
    gc_contents = compute_GC(sequences)

    max_key = max(gc_contents, key=gc_contents.get)
    print(max_key)
    print(gc_contents[max_key])
Example #18
                       help='File containing the FASTA sequence.')
    parser.add_argument(
        '--probs',
        action='store_true',
        help='Output class probabilities together with the prediction.')
    parser.add_argument('filename_model',
                        type=str,
                        help='File containing the model.')
    args = parser.parse_args()

    with open(args.filename_model, 'rb') as fh:
        model = pickle.load(fh)

    if args.pssm:
        profile = np.array(parse_pssm(args.pssm))
    else:
        profile = np.array(seq_to_profile(parse_fasta(args.fasta)))

    dssp = ''

    for i in range(0, len(profile)):
        half_window_size = int((len(model.support_vectors_[0]) / 20 - 1) / 2)
        part1 = np.zeros(20 * max(0, half_window_size - i))
        part2 = np.ndarray.flatten(
            profile[max(0, i - half_window_size):min(i + half_window_size +
                                                     1, len(profile))])
        part3 = np.zeros(20 * max(0, half_window_size -
                                  (len(profile) - i - 1)))
        vec = np.concatenate((part1, part2, part3))
        if args.probs:
            probs = model.predict_proba(vec.reshape(1, -1))[0]
            dssp += ' '.join([str(round(prob, 3)).rjust(5) for prob in probs])
Example #19
def find_spliced_motif(sequence, motif):
    """
    Returns:
        a list of indexes of where the motif is within sequence
    """
    motif_index = 0
    indexes = []

    for i, aa in enumerate(sequence):
        if aa == motif[motif_index]:
            indexes.append(i + 1)
            motif_index += 1

            if motif_index == len(motif):
                break

    return indexes


if __name__ == "__main__":
    path = "../data/SSEQ.txt"
    with open(path) as f:
        data = [s.strip() for s in f.readlines()]

    fasta_seqs = parse_fasta(data)

    _, sequence = max(fasta_seqs.items(), key=lambda x: len(x[1]))
    _, motif = min(fasta_seqs.items(), key=lambda x: len(x[1]))

    indexes = find_spliced_motif(sequence, motif)

    print(" ".join(map(str, indexes)))
Example #20
    def test_definition_case(self):
        fasta_list = utils.parse_fasta(self.SAMPLE_DATASET)
        label, gcc = max_gc_content(fasta_list)
        self.assertEqual(label, 'Rosalind_0808')
        self.assertAlmostEqual(gcc, .60919540)