Example #1
def mergeRegionAssemblies(regionDirList, options):
    # Merge the per-region Trinity assemblies into a single Transcript.fa,
    # renumbering each transcript and its component/gene identifiers.
    count = 0
    c_count = 0
    g_count = 0
    fadic = {}
    orderlist = []
    desdic = {}

    for tmpdir in regionDirList:
        clist, glist = [],[]
        tridir = '%s.trinity.Trinity.fasta' %tmpdir
        if not os.path.isfile(tridir):
            continue
        tmpfa = Fasta.Parse(tridir)
        for seqid in tmpfa.id:
            count += 1
            id2 = seqid.split('|')[1]
            c, g, i = id2.split('_')
            if c not in clist:
                clist.append(c)
                c_count += 1
            if g not in glist:
                glist.append(g)
                g_count += 1
            newid = 'Transcript%d|c%d_g%d_%s' %(count,c_count,g_count,i)
            chrid = tmpdir.split('/')[-2]
            regid = tmpdir.split('/')[-1][6:]
            seqlen= len(tmpfa.seq[seqid])
            fadic[newid] = tmpfa.seq[seqid]
            desdic[newid] = '%s:%s len=%d' %(chrid,regid,seqlen)
            orderlist.append(newid)
    Fasta.write(fadic, '%s/Transcript.fa' %options.outpath, orderlist=orderlist, description=desdic)
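
A hypothetical invocation of the merger above, for context; the directory layout and the `_Options` object (which only needs an `outpath` attribute here) are invented:

# Hypothetical usage sketch; each region directory is expected to have a
# sibling file named <dir>.trinity.Trinity.fasta produced by Trinity, and
# the chromosome/region ids are recovered from the path components.
class _Options(object):
    outpath = '/tmp/merged_assembly'

region_dirs = ['/data/assemblies/chr1/region000001',
               '/data/assemblies/chr1/region000002']
mergeRegionAssemblies(region_dirs, _Options())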
Example #2
def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the mixture weights
    weights = [fs.weight_a, fs.weight_b, fs.weight_c]
    # get the matrices
    matrices = [fs.matrix_a, fs.matrix_b, fs.matrix_c]
    for R in matrices:
        if R.shape != (4, 4):
            msg = 'expected each nucleotide rate matrix to be 4x4'
            raise HandlingError(msg)
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # create the mixture proportions
    weight_sum = sum(weights)
    mixture_proportions = [weight / weight_sum for weight in weights]
    # create the rate matrix objects
    ordered_states = list('ACGT')
    rate_matrix_objects = []
    for R in matrices:
        rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
        rate_matrix_objects.append(rate_matrix_object)
    # create the mixture model
    mixture_model = SubModel.MixtureModel(mixture_proportions,
                                          rate_matrix_objects)
    # normalize the mixture model
    mixture_model.normalize()
    # return the html string
    return do_analysis(mixture_model, alignment, tree) + '\n'
Example #3
def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # define the jukes cantor rate matrix
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
        dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(row_major_rate_matrix,
                                               ordered_states)
    # simulate the ancestral alignment
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
            tree, alignment, rate_matrix_object)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # get the alignment string using an ordering defined by the tree
    arr = []
    for node in tree.preorder():
        arr.append(alignment.get_fasta_sequence(node.name))
    # return the response
    return '\n'.join(arr) + '\n'
Example #4
def get_response_content(fs):
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() != 2:
        raise HandlingError('expected a sequence pair')
    # read the rate matrix
    R = fs.matrix
    # read the ordered states
    ordered_states = Util.get_stripped_lines(fs.states.splitlines())
    if len(ordered_states) != len(R):
        msg_a = 'the number of ordered states must be the same '
        msg_b = 'as the number of rows in the rate matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the rate matrix object using the ordered states
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states) 
    # create the objective function
    objective = Objective(alignment.sequences, rate_matrix_object)
    # Use golden section search to find the mle distance.
    # The bracket is just a suggestion.
    bracket = (0.51, 2.01)
    mle_distance = optimize.golden(objective, brack=bracket)
    # write the response
    out = StringIO()
    print >> out, 'maximum likelihood distance:', mle_distance
    #distances = (mle_distance, 0.2, 2.0, 20.0)
    #for distance in distances:
        #print >> out, 'f(%s): %s' % (distance, objective(distance))
    return out.getvalue()
Example #5
def test_seq_len(self):
    seq_lens = [12, 14, 9, 11, 16]
    _file = os.path.join(os.getcwd(), "_test_fasta.fa.gz")
    ifile = gzip.open(_file, 'rt')
    _test_fasta = Fasta.Fasta(ifile)
    for _fas in _test_fasta:
        self.assertTrue(_fas.length in seq_lens)
Example #6
def make_xml(start_pos, stop_pos, nsamples):
    """
    @return: location of xml file, location of log file
    """
    out = StringIO()
    print >> out, g_xml_pre_alignment
    print >> out, """
        <!-- The sequence alignment (each sequence refers to a taxon above). -->
        <alignment id="alignment" dataType="nucleotide">
    """
    lines = g_fasta_string.splitlines()
    for header, seq in Fasta.gen_header_sequence_pairs(lines):
        print >> out, '<sequence>'
        print >> out, '<taxon idref="%s"/>' % header
        print >> out, seq
        print >> out, '</sequence>'
    print >> out, '</alignment>'
    print >> out, """
        <patterns id="firsthalf.patterns" from="%d" to="%d">
            <alignment idref="alignment"/>
        </patterns>
    """ % (start_pos, stop_pos)
    print >> out, get_xml_post_alignment(nsamples)
    log_loc = Util.get_tmp_filename(prefix='beast', suffix='.log')
    print >> out, get_log_xml(log_loc)
    xml_loc = Util.create_tmp_file(
            out.getvalue(), prefix='beast', suffix='.xml')
    return xml_loc, log_loc
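
A hypothetical call for orientation; from the template above, start_pos and stop_pos fill the patterns block and nsamples is forwarded to get_xml_post_alignment:

# Hypothetical usage sketch; the numbers are arbitrary.
xml_loc, log_loc = make_xml(1, 500, 10000)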
Example #7
def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the sequence order if it exists
    ordered_names = Util.get_stripped_lines(fs.order.splitlines())
    if ordered_names:
        observed_name_set = set(ordered_names)
        expected_name_set = set(node.get_name() for node in tree.gen_tips())
        extra_names = observed_name_set - expected_name_set
        missing_names = expected_name_set - observed_name_set
        if extra_names:
            msg_a = 'the list of ordered names includes these names '
            msg_b = 'not found in the tree: %s' % str(tuple(extra_names))
            raise HandlingError(msg_a + msg_b)
        if missing_names:
            msg_a = 'the tree includes these names not found in the list '
            msg_b = 'of ordered names: %s' % str(tuple(missing_names))
            raise HandlingError(msg_a + msg_b)
    else:
        ordered_names = list(node.get_name() for node in tree.gen_tips())
    # do the sampling
    sampled_sequences = JC69.sample_sequences(tree, ordered_names, fs.length)
    alignment = Fasta.create_alignment(ordered_names, sampled_sequences)
    # return the response
    return alignment.to_fasta_string() + '\n'
Example #8
def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the sequence order if it exists
    ordered_names = Util.get_stripped_lines(fs.order.splitlines())
    if ordered_names:
        observed_name_set = set(ordered_names)
        expected_name_set = set(node.get_name() for node in tree.gen_tips())
        extra_names = observed_name_set - expected_name_set
        missing_names = expected_name_set - observed_name_set
        if extra_names:
            msg_a = 'the list of ordered names includes these names '
            msg_b = 'not found in the tree: %s' % str(tuple(extra_names))
            raise HandlingError(msg_a + msg_b)
        if missing_names:
            msg_a = 'the tree includes these names not found in the list '
            msg_b = 'of ordered names: %s' % str(tuple(missing_names))
            raise HandlingError(msg_a + msg_b)
    else:
        ordered_names = list(node.get_name() for node in tree.gen_tips())
    # do the sampling
    sampled_sequences = JC69.sample_sequences(tree, ordered_names, fs.length)
    alignment = Fasta.create_alignment(ordered_names, sampled_sequences)
    # return the response
    return alignment.to_fasta_string() + '\n'
Example #9
def main():
    # create the alignment object
    print 'creating the alignment...'
    alignment_string = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(alignment_string))
    # create a tree object
    print 'creating the tree...'
    tree_string = Newick.brown_example_tree
    tree = Newick.parse(tree_string, Newick.NewickTree)
    # create a rate matrix object
    print 'creating the rate matrix object...'
    distribution = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    kappa = 2.0
    row_major_rate_matrix = RateMatrix.get_unscaled_hky85_rate_matrix(
        distribution, kappa).get_row_major_rate_matrix()
    rate_matrix = RateMatrix.FastRateMatrix(row_major_rate_matrix,
                                            list('ACGT'))
    rate_matrix.normalize()
    # get the mle_rates
    print 'getting the mle rates...'
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print 'mle rates:'
    print mle_rates
    print 'stockholm string:'
    print get_stockholm_string(tree, alignment, mle_rates)
Example #10
def simulate_ancestral_alignment(tree, alignment, substitution_model):
    """
    @param tree: a newick tree with branch lengths
    @param alignment: a Fasta Alignment object with headers that match the tree tip names
    @param substitution_model: a way to simulate ancestral states from a tree given its leaf states
    @return: a Fasta Alignment object of the simulated ancestral sequences
    """
    for node in tree.gen_non_root_nodes():
        if node.get_branch_length() is None or node.get_branch_length() <= 0:
            raise SimulationError('all branch lengths should be positive')
    for node in tree.gen_internal_nodes():
        if not node.name:
            raise SimulationError('all internal nodes should be named')
    simulated_ancestors = dict((node.name, []) for node in tree.gen_internal_nodes())
    for col in alignment.columns:
        name_to_letter = dict(zip(alignment.headers, col))
        # Augment each tip with its corresponding letter.
        for tip in tree.gen_tips():
            tip.state = name_to_letter[tip.name]
        # Do the simulation.
        substitution_model.simulate_ancestral_states(tree)
        name_state_pairs = [(node.name, node.state) for node in tree.gen_internal_nodes_preorder()]
        # Add this simulated column.
        for name, state in name_state_pairs:
            simulated_ancestors[name].append(state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    print >> sio, alignment.to_fasta_string()
    for header, sequence in simulated_ancestors.items():
        print >> sio, '>' + header
        print >> sio, ''.join(sequence)
    fasta_string = sio.getvalue()
    return Fasta.Alignment(StringIO(fasta_string))
Example #11
def make_xml(start_pos, stop_pos, nsamples):
    """
    @return: location of xml file, location of log file
    """
    out = StringIO()
    print >> out, g_xml_pre_alignment
    print >> out, """
        <!-- The sequence alignment (each sequence refers to a taxon above). -->
        <alignment id="alignment" dataType="nucleotide">
    """
    lines = g_fasta_string.splitlines()
    for header, seq in Fasta.gen_header_sequence_pairs(lines):
        print >> out, '<sequence>'
        print >> out, '<taxon idref="%s"/>' % header
        print >> out, seq
        print >> out, '</sequence>'
    print >> out, '</alignment>'
    print >> out, """
        <patterns id="firsthalf.patterns" from="%d" to="%d">
            <alignment idref="alignment"/>
        </patterns>
    """ % (start_pos, stop_pos)
    print >> out, get_xml_post_alignment(nsamples)
    log_loc = Util.get_tmp_filename(prefix='beast', suffix='.log')
    print >> out, get_log_xml(log_loc)
    xml_loc = Util.create_tmp_file(out.getvalue(),
                                   prefix='beast',
                                   suffix='.xml')
    return xml_loc, log_loc
Example #12
def get_response_content(fs):
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # convert the nucleotide weights to probabilities
    nt_probs = [x / float(sum(nt_weights)) for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = float(A + G)
    Y = float(C + T)
    if R <= 0:
        raise HandlingError('the frequency of a purine must be positive')
    if Y <= 0:
        raise HandlingError('the frequency of a pyrimidine must be positive')
    if fs.kappa <= max(-Y, -R):
        msg_a = 'kappa must be greater than max(-R, -Y) '
        msg_b = 'where R and Y are the purine and pyrimidine frequencies'
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
        fs.distance, model, fs.length)
    # convert the pair of sequences to an alignment object
    aln = StringIO()
    print >> aln, '>first'
    print >> aln, ''.join(sequence_pair[0])
    print >> aln, '>second'
    print >> aln, ''.join(sequence_pair[1])
    return Fasta.Alignment(StringIO(aln.getvalue())).to_fasta_string() + '\n'
Example #13
def parseGumby(gumbyFile, exonFile, baseSeq):
    """
    Parse gumbyFile, removing blocks that overlap exons and gumbies that
    consist only of gaps on baseSeq.
    Return a list of gumbyBlocks.
    """
    infile = open(gumbyFile, "r")

    exons = []
    if exonFile!=None:
        fh = open(exonFile, "r")
        for l in fh:
            fs = l.split()
            if fs[0].lower()!=baseSeq:
                continue
            exons.append([ int(fs[3]), int(fs[4]) ] )
    # print exons

    re1 = compile("[a-z]+[ ]+[0-9]+[ ]+[0-9]+")
    seqs = {}
    pos = {}
    i = -1

    resultLst = alignment.Alignment()
    for l in infile:
        l = l.strip()
        l = l.replace("*","-")
        l = l.replace("<", "-")
        l = l.replace(">", "-")
        if l.startswith("start"):
            if i!=-1:
                resultLst.extend(procData(baseSeq, exons, i, seqs, pos, pval, length, score))
            f = l.split()
            pval = float(f[-1])
            length = int(f[6].strip(","))
            score = int(f[8].strip(","))
            i+=1
            seqs={}

        if re1.match(l):
            f = l.split()
            name = f[0]
            start = int(f[1])-1
            end = int(f[2])-1

            seq = f[3]
            if name not in seqs:
                faseq = Fasta.FastaSeq(name, seq)
                faseq.chrom = name
                faseq.start = start
                faseq.end = end
                seqs[name] = faseq
            else:
                faseq = seqs[f[0]] 
                faseq.nucl += f[3]
            pos[name] = (name, start,end)

    resultLst.extend(procData(baseSeq, exons, i, seqs, pos, pval, length, score))
    return resultLst
Example #14
def test_string_filename(self):
    known_seq = [
        'GGGACAGGGGGC', 'GGGACTGGGGGGGC', 'ATGGCATAT', 'ATGGCATATCA',
        'ATCGGAGGGATACGAG'
    ]
    ifile = os.path.join(os.getcwd(), "_test_fasta.fa")
    _test_fasta = Fasta.Fasta(ifile)
    for _fas in _test_fasta:
        self.assertTrue(_fas.sequence in known_seq)
Example #15
def test_open_filename(self):
    known_seq = [
        'GGGACAGGGGGC', 'GGGACTGGGGGGGC', 'ATGGCATAT', 'ATGGCATATCA',
        'ATCGGAGGGATACGAG'
    ]
    _file = os.path.join(os.getcwd(), "_test_fasta.fa.gz")
    ifile = gzip.open(_file, 'rt')
    _test_fasta = Fasta.Fasta(ifile)
    for _fas in _test_fasta:
        self.assertTrue(_fas.sequence in known_seq)
Example #16
def get_amino_acid_alignment(table):
    """
    @param table: a table of data in some random format sent by Ferran Casals
    @return: a Fasta amino acid alignment object
    """
    if len(table) < 2:
        raise HandlingError('the data table should have at least two rows')
    first_row = table[0]
    if len(first_row) < 6:
        raise HandlingError(
                'the first row of the table has %d columns '
                'but at least six were expected' % len(first_row))
    if first_row[0].upper() != 'variant'.upper():
        raise HandlingError('expected the first column to be the variant')
    if first_row[1].upper() != 'chr'.upper():
        raise HandlingError('expected the second column to be the chromosome')
    if first_row[2].upper() != 'position'.upper():
        raise HandlingError('expected the third column to be the position')
    if first_row[3].upper() != 'Amino Acid Change'.upper():
        raise HandlingError(
                'expected the fourth column to be the amino acid change')
    if first_row[4].upper() != 'alleles'.upper():
        raise HandlingError(
                'expected the fifth column to be the nucleotide change')
    remaining_rows = table[1:]
    for row in remaining_rows:
        if len(row) != len(first_row):
            raise HandlingError(
                    'each row should have the same number of columns')
    # get the ordered taxa
    taxa = first_row[5:]
    if len(set(taxa)) != len(taxa):
        raise HandlingError('the same taxon appears in more than one column')
    # get the sequence of codons for each taxon
    codon_sequences = zip(*remaining_rows)[5:]
    # convert codon sequences to amino acid sequences
    aa_sequences = []
    for codon_sequence in codon_sequences:
        aa_list = []
        for codon in codon_sequence:
            codon = codon.upper()
            if codon == 'ND':
                aa = '-'
            elif codon in Codon.g_non_stop_codons:
                aa = Codon.g_codon_to_aa_letter[codon]
            elif codon in Codon.g_stop_codons:
                raise HandlingError(
                        'one of the codons is a stop codon: %s' % codon)
            else:
                raise HandlingError(
                        'one of the codons is invalid: %s' % codon)
            aa_list.append(aa)
        aa_sequences.append(''.join(aa_list))
    # return the alignment
    return Fasta.create_alignment(taxa, aa_sequences)
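
For reference, a minimal, invented table in the layout this parser checks for (header row plus data rows; the taxon names, positions, and codons are made up):

# Hypothetical input sketch for get_amino_acid_alignment(); the first five
# column headers follow the checks above, the taxon columns are invented.
table = [
    ['variant', 'chr', 'position', 'Amino Acid Change', 'alleles',
     'human', 'chimp'],
    ['var1', '7', '117559590', 'F508del', 'C/T', 'TTT', 'ND'],
    ['var2', '7', '117559600', 'G551D', 'G/A', 'GGA', 'GGA'],
]
alignment = get_amino_acid_alignment(table)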
Example #17
def calcGC(targetDic, faPath):
    tmpdic = {}
    fa = Fasta.Parse(faPath)
    for chrid in targetDic:
        print chrid
        tmpdic[chrid] = {}
        for item in targetDic[chrid]:
            seq = fa.seq[chrid][targetDic[chrid][item][0]:targetDic[chrid][item][1]].upper()
            # use float division so the GC fraction is not truncated to 0 under Python 2
            gc = (seq.count('G') + seq.count('C')) / float(len(seq))
            tmpdic[chrid][item] = gc
    return tmpdic
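
A hypothetical call, to make the expected shape of targetDic explicit (per-chromosome dicts of 0-based, end-exclusive intervals; names, path, and coordinates are invented):

# Hypothetical usage sketch; intervals are Python slice coordinates into
# the corresponding FASTA sequence.
targets = {'chr1': {'region_a': (1000, 1500), 'region_b': (9000, 9600)}}
gc_by_region = calcGC(targets, 'genome.fa')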
Example #18
def test_likelihood_calculation(self):
    # get a tree
    tree = Newick.parse(sample_tree_string, Newick.NewickTree)
    # get a model
    input_xml_string = get_sample_xml_string()
    model = deserialize_mixture_model(input_xml_string)
    # get an alignment
    alignment = Fasta.CodonAlignment(
        StringIO(long_sample_codon_alignment_string))
    # get the likelihood
    log_likelihood = PhyLikelihood.get_log_likelihood(
        tree, alignment, model)
Example #19
def get_amino_acid_alignment(table):
    """
    @param table: a table of data in some random format sent by Ferran Casals
    @return: a Fasta amino acid alignment object
    """
    if len(table) < 2:
        raise HandlingError('the data table should have at least two rows')
    first_row = table[0]
    if len(first_row) < 6:
        raise HandlingError('the first row of the table has %d columns '
                            'but at least six were expected' % len(first_row))
    if first_row[0].upper() != 'variant'.upper():
        raise HandlingError('expected the first column to be the variant')
    if first_row[1].upper() != 'chr'.upper():
        raise HandlingError('expected the second column to be the chromosome')
    if first_row[2].upper() != 'position'.upper():
        raise HandlingError('expected the third column to be the position')
    if first_row[3].upper() != 'Amino Acid Change'.upper():
        raise HandlingError(
            'expected the fourth column to be the amino acid change')
    if first_row[4].upper() != 'alleles'.upper():
        raise HandlingError(
            'expected the fifth column to be the nucleotide change')
    remaining_rows = table[1:]
    for row in remaining_rows:
        if len(row) != len(first_row):
            raise HandlingError(
                'each row should have the same number of columns')
    # get the ordered taxa
    taxa = first_row[5:]
    if len(set(taxa)) != len(taxa):
        raise HandlingError('the same taxon appears in more than one column')
    # get the sequence of codons for each taxon
    codon_sequences = zip(*remaining_rows)[5:]
    # convert codon sequences to amino acid sequences
    aa_sequences = []
    for codon_sequence in codon_sequences:
        aa_list = []
        for codon in codon_sequence:
            codon = codon.upper()
            if codon == 'ND':
                aa = '-'
            elif codon in Codon.g_non_stop_codons:
                aa = Codon.g_codon_to_aa_letter[codon]
            elif codon in Codon.g_stop_codons:
                raise HandlingError('one of the codons is a stop codon: %s' %
                                    codon)
            else:
                raise HandlingError('one of the codons is invalid: %s' % codon)
            aa_list.append(aa)
        aa_sequences.append(''.join(aa_list))
    # return the alignment
    return Fasta.create_alignment(taxa, aa_sequences)
Example #20
    def runPairWiseDiffs(self, fastaFileNames):

        print 'Calculating pairwise differences...',

        # Read in fasta sequences into a dictionary:
        completeSets = {}
        for fastaFileName in fastaFileNames:
            baseName = os.path.splitext(os.path.basename(fastaFileName))[0]
            #baseName = os.path.basename(fastaFileName).split(".")[0]
            completeSets[baseName] = {}

            fastaFile = open(fastaFileName, 'r')
            fastaIterator = Fasta.Iterator(fastaFile,
                                           parser=Fasta.RecordParser())
            for fastaRecord in fastaIterator:
                newName = safeName(copy.copy(fastaRecord.title))
                #completeSets[baseName][fastaRecord.title.strip()] = fastaRecord.sequence
                completeSets[baseName][newName] = fastaRecord.sequence
            fastaFile.close()

        # Load existing alignment matrix
        alignmentMatrices = {}
        for fastaFileBaseName in completeSets.keys():
            if not alignmentMatrices.has_key(fastaFileBaseName):
                alignmentMatrices[fastaFileBaseName] = {}

            alignmentMatrixFileName = os.path.join(
                self.options.statsdir, fastaFileBaseName + "_matrix.pickle")
            if os.path.exists(alignmentMatrixFileName) and os.path.getsize(
                    alignmentMatrixFileName) > 0:
                alignmentMatrixFile = open(alignmentMatrixFileName, 'r')
                alignmentMatrices[fastaFileBaseName] = pickle.load(
                    alignmentMatrixFile)
                alignmentMatrixFile.close()

        # Add any new alignments to alignment matrix (and save to them to file)
        self.updateAlignmentMatrix(alignmentMatrices, completeSets)

        print 'done'
Example #21
def test_simulation(self):
    tree_string = '(((Human:0.1, Chimpanzee:0.2)to-chimp:0.8, Gorilla:0.3)to-gorilla:0.7, Orangutan:0.4, Gibbon:0.5)all;'
    # Parse the example tree.
    tree = Newick.parse(tree_string, Newick.NewickTree)
    tree.assert_valid()
    # Get header and sequence pairs.
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # Get the Jukes-Cantor rate matrix object.
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
        dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
        row_major_rate_matrix, ordered_states)
    # Simulate ancestral states.
    simulated_alignment = simulate_ancestral_alignment(
        tree, alignment, rate_matrix_object)
Example #22
def get_response_content(fs):
    out = StringIO()
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        print >> out, 'This is a valid alignment.'
    except Fasta.AlignmentError as e:
        alignment = None
        print >> out, 'This is not a valid alignment:', e
    if alignment:
        try:
            old_column_count = len(alignment.columns)
            alignment.force_nucleotide()
            removed_column_count = old_column_count - len(alignment.columns)
            if removed_column_count:
                print >> out, ('After removing %d' % removed_column_count),
                print >> out, 'columns this is a valid nucleotide alignment.'
            else:
                print >> out, 'This is a valid nucleotide alignment.'
        except Fasta.AlignmentError as e:
            print >> out, 'This is not a valid nucleotide alignment:', e
    for header, seq in Fasta.gen_header_sequence_pairs(StringIO(fs.fasta)):
        print >> out, '%s: %d' % (header, len(seq))
    return out.getvalue()
Example #23
def test_gc_content(self):
    known_seq = [
        'GGGACAGGGGGC', 'GGGACTGGGGGGGC', 'ATGGCATAT', 'ATGGCATATCA',
        'ATCGGAGGGATACGAG'
    ]
    gc_ = [
        0.833333333333, 0.857142857143, 0.333333333333, 0.363636363636,
        0.5625
    ]
    gc_dict = dict(zip(known_seq, gc_))
    ifile = os.path.join(os.getcwd(), "_test_fasta.fa.gz")
    _test_fasta = Fasta.Fasta(ifile)
    for _fas in _test_fasta:
        self.assertAlmostEqual(_fas.gc, gc_dict[_fas.sequence], places=4)
Example #24
def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the normalized Direct RNA mixture model
    mixture_model = DirectRna.deserialize_mixture_model(fs.model)
    mixture_model.normalize()
    # return the html string
    return do_analysis(mixture_model, alignment, tree) + '\n'
Example #25
def test_likelihood(self):
    # Parse the example tree.
    tree_string = Newick.brown_example_tree
    tree = Newick.parse(tree_string, Newick.NewickTree)
    tree.assert_valid()
    # Get header and sequence pairs.
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # Get the Jukes-Cantor rate matrix object.
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
        dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
        row_major_rate_matrix, ordered_states)
    # Calculate the log likelihood.
    log_likelihood = get_log_likelihood(tree, alignment, rate_matrix_object)
    self.assertAlmostEqual(log_likelihood, -4146.26547208)
Example #26
def get_response_content(fs):
    # read the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # Create the distance matrix,
    # replacing values of None with the representation for infinity.
    row_major_distance_matrix = []
    for row in JC69.get_ML_distance_matrix(alignment.sequences):
        corrected_row = [fs.infinity if x == float('inf') else x for x in row]
        row_major_distance_matrix.append(corrected_row)
    # return the response
    return MatrixUtil.m_to_string(row_major_distance_matrix) + '\n'
Example #27
def protsplit(protstr, prefix='', minlen=0, offset=0, dir='fwd'):
    prot = 'ACDEFGHIKLMNPQRSTVWY'
    re_prot = re.compile('[%s]+' % prot)
    out = []
    plen = len(protstr)
    nlen = plen * 3
    for match in re_prot.finditer(protstr):
        #		print prefix,offset,match.start(),match.end()
        start = match.start()
        end = match.end()
        if end - start < minlen: continue
        nucleotide_start = start * 3 + offset
        if dir == 'rvs': nucleotide_start = nlen - start * 3 - offset
        out.append(
            Fasta(protstr[match.start():match.end()],
                  '%s_%i' % (prefix, nucleotide_start)))
    return out
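
A hypothetical call to the splitter above, assuming Fasta here is a simple record constructor taking (sequence, name); the input string and prefix are invented:

# Hypothetical usage sketch: keep protein stretches of at least 5 residues
# from a translated frame; '*' (stop) characters break the matches.
fragments = protsplit('MKTAY*ACDEFGHIK*LM', prefix='contig1', minlen=5)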
Example #28
def parse(gff_file, base=None):
	ins_file=gff_file.replace(".gff","")+".ins"
	insertions=None if not os.path.exists(ins_file) else Fasta.parse(ins_file, todict=True)
	calls=[]
	for entry in open(gff_file, "r"):
		if entry.startswith("#"): continue
		try:
			call=Call(entry, base=base)
			if insertions is not None and call.id in insertions: call.inserted=str(insertions[call.id].seq)
			elif "Iseq" in call.attributes: 
				call.inserted=call.attributes["Iseq"]
				del call.attributes["Iseq"]
			calls.append(call)
		except:
			print >> sys.stderr, "Unable to parse line: %s" % entry
			raise
	return calls
Example #29
def get_response_content(fs):
    """
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # read the rate matrix
    R = fs.matrix
    # read the ordered states
    ordered_states = Util.get_stripped_lines(StringIO(fs.states))
    if len(ordered_states) != len(R):
        msg_a = 'the number of ordered states must be the same '
        msg_b = 'as the number of rows in the rate matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the rate matrix object using the ordered states
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    # create the distance matrix
    n = alignment.get_sequence_count()
    row_major_distance_matrix = [[0] * n for i in range(n)]
    for i, sequence_a in enumerate(alignment.sequences):
        for j, sequence_b in enumerate(alignment.sequences):
            if i < j:
                # create the objective function using the sequence pair
                objective = Objective((sequence_a, sequence_b),
                                      rate_matrix_object)
                # Use golden section search to find the mle distance.
                # The bracket is just a suggestion.
                bracket = (0.51, 2.01)
                mle_distance = optimize.golden(objective, brack=bracket)
                # fill two elements of the matrix
                row_major_distance_matrix[i][j] = mle_distance
                row_major_distance_matrix[j][i] = mle_distance
    # write the response
    out = StringIO()
    print >> out, 'maximum likelihood distance matrix:'
    print >> out, MatrixUtil.m_to_string(row_major_distance_matrix)
    return out.getvalue()
Example #30
def get_response_content(fs):
    # get the alignment object
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # assert that the alignment is of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # assert that the alignment is a gapless unambiguous nucleotide alignment
    old_column_count = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_column_count = alignment.get_column_count()
    if old_column_count != new_column_count:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimates according to a numeric optimizer.
    f = F84.Objective(alignment.sequences)
    values = list(f.get_initial_parameters())
    result = scipy.optimize.fmin(f, values, ftol=1e-10, disp=0)
    distance, kappa, wC, wG, wT = result
    nt_distribution = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(distance,
                                                       alignment.sequences,
                                                       model)
    # begin the response
    out = StringIO()
    print >> out, 'ML distance:', distance
    print >> out, 'ML kappa:', kappa
    print >> out, 'ML A frequency:', A
    print >> out, 'ML C frequency:', C
    print >> out, 'ML G frequency:', G
    print >> out, 'ML T frequency:', T
    print >> out, 'log likelihood:', log_likelihood
    # write the response
    return out.getvalue()
Example #31
def get_response_content(fs):
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the alignment
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the log likelihood
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states)
    log_likelihood = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # return the response
    return str(log_likelihood) + '\n'
Example #32
def simulate_alignment(tree, substitution_model, ncolumns, seed=None):
    """
    @param tree: a newick tree with branch lengths
    @param substitution_model: a way to simulate states on a tree
    @param ncolumns: the number of columns to simulate
    @param seed: a random number seed
    @return: a Fasta Alignment object of the simulated sequences
    """
    # Check the input.
    for node in tree.gen_non_root_nodes():
        if node.get_branch_length() is None or node.get_branch_length() <= 0:
            raise SimulationError('all branch lengths should be positive')
    tip_names = [node.name for node in tree.gen_tips()]
    for name in tip_names:
        if not name:
            raise SimulationError('each leaf should have a name')
    if len(tip_names) != len(set(tip_names)):
        raise SimulationError('each leaf should have a unique name')
    # Save the rng state if we are using a seed.
    if seed is not None:
        old_rng_state = random.getstate()
    # Seed the rng if we are using a seed.
    if seed is not None:
        random.seed(seed)
    # Simulate the states on the tree.
    simulated_sequences = dict((node.name, []) for node in tree.gen_tips())
    for column_index in range(ncolumns):
        substitution_model.simulate_states(tree)
        for node in tree.gen_tips():
            simulated_sequences[node.name].append(node.state)
    # Restore the rng state if we are using a seed
    if seed is not None:
        random.setstate(old_rng_state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    for header, sequence in simulated_sequences.items():
        print >> sio, '>' + header
        print >> sio, ''.join(sequence)
    fasta_string = sio.getvalue()
    return Fasta.Alignment(StringIO(fasta_string))
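
A hypothetical usage of the simulator above, reusing the tree and Jukes-Cantor setup seen in the surrounding examples; that a plain RateMatrix object can serve as the substitution model is assumed here, and the column count and seed are arbitrary:

# Hypothetical usage sketch: simulate a reproducible 100-column nucleotide
# alignment at the tips of the example tree.
tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
ordered_states = list('ACGT')
row_major = MatrixUtil.dict_to_row_major(
    RateMatrix.get_jukes_cantor_rate_matrix(), ordered_states, ordered_states)
rate_matrix_object = RateMatrix.RateMatrix(row_major, ordered_states)
alignment = simulate_alignment(tree, rate_matrix_object, 100, seed=42)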
Example #33
def get_response_content(fs):
    # get the alignment object
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    # assert that the alignment is of exactly two sequences
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # assert that the alignment is a gapless unambiguous nucleotide alignment
    old_column_count = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    new_column_count = alignment.get_column_count()
    if old_column_count != new_column_count:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # get the maximum likelihood estimates
    sequence_pair = alignment.sequences
    distance, kappa, A, C, G, T = F84.get_closed_form_estimates(sequence_pair)
    # get the log likelihood
    nt_distribution = (A, C, G, T)
    rate_matrix_object = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(distance,
                                                       alignment.sequences,
                                                       rate_matrix_object)
    # begin the response
    out = StringIO()
    print >> out, 'distance:', distance
    print >> out, 'kappa:', kappa
    print >> out, 'A frequency:', A
    print >> out, 'C frequency:', C
    print >> out, 'G frequency:', G
    print >> out, 'T frequency:', T
    print >> out, 'log likelihood:', log_likelihood
    # return the response
    return out.getvalue()
Example #34
def parse(gff_file, base=None):
	logger = logging.getLogger(parse.__name__)

	ins_file=gff_file.replace(".gff","")+".ins"
	insertions=None if not os.path.exists(ins_file) else Fasta.parse(ins_file, todict=True)
	if insertions is None:
		logger.warn("Insertion sequence file %s missing" % ins_file)

	calls=[]
	for entry in open(gff_file, "r"):
		if entry.startswith("#"): continue
		try:
			call=Call(entry, base=base)
			if insertions is not None and call.id in insertions: call.inserted=str(insertions[call.id].seq)
			elif "Iseq" in call.attributes: 
				call.inserted=call.attributes["Iseq"]
				del call.attributes["Iseq"]
			calls.append(call)
		except:
			logger.error("Unable to parse line: %s" % entry)
			raise
	return calls
Example #35
def convert(inputVCFFile='', outputVariantFile='', parameters={}):

    fo = open(outputVariantFile, 'w')

    fa = Fasta.Fasta(fname=parameters['refFile'])

    vcffiles = inputVCFFile.split(',')

    for vcffile in vcffiles:
        vcf = VCFFile.VCFFile(fname=vcffile, mode='r')

        while True:
            dat = vcf.readline()

            if dat == {}:
                break

            pos = int(dat['POS'])
            chr = dat['CHROM']
            ref = dat['REF']

            rseq = ''.join(fa.get(chr, pos, len(ref)))
            if rseq != ref:
                sys.stderr.write("REFSEQ inconsistency\n")

            if float(dat['QUAL']) >= parameters['minQual']:
                altseq = dat['ALT'].split(',')

                for alt in altseq:
                    if alt != "<DEL>" and len(alt) != len(ref):
                        var = Variant.Variant4(ref=ref, alt=alt)
                        if var.type == "ins" or var.type == "del":
                            fo.write("%s %d %s\n" %
                                     (chr, pos + var.offset - 1, var.str))

        # close each VCF as soon as it has been read
        vcf.close()

    fo.close()
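
A hypothetical invocation of the converter above; the file names are invented and `parameters` supplies the two keys the function actually reads (`refFile` and `minQual`):

# Hypothetical usage sketch; writes one "chrom pos variant" line per
# qualifying indel found in the listed VCF files.
convert(inputVCFFile='sample1.vcf,sample2.vcf',
        outputVariantFile='indels.txt',
        parameters={'refFile': 'reference.fa', 'minQual': 20})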
Example #36
def get_response_content(fs):
    out = StringIO()
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        print >> out, 'This is a valid alignment.'
    except Fasta.AlignmentError as e:
        alignment = None
        print >> out, 'This is not a valid alignment:', e
    if alignment:
        try:
            old_column_count = len(alignment.columns)
            alignment.force_nucleotide()
            removed_column_count = old_column_count - len(alignment.columns)
            if removed_column_count:
                print >> out, ('After removing %d' % removed_column_count),
                print >> out, 'columns this is a valid nucleotide alignment.'
            else:
                print >> out, 'This is a valid nucleotide alignment.'
        except Fasta.AlignmentError as e:
            print >> out, 'This is not a valid nucleotide alignment:', e
    for header, seq in Fasta.gen_header_sequence_pairs(StringIO(fs.fasta)):
        print >> out, '%s: %d' % (header, len(seq))
    return out.getvalue()
Example #37
import os
import re
import sys

if __name__ == '__main__':
    #print(sys.argv)
    inFile = sys.stdin
else:
    workDir = '/home/ashis/work/github/courses/JHU_Computational_Genomics/HW3'
    os.chdir(workDir)
    sys.path.append(os.path.abspath(os.getcwd()))
    inputFilePath = 'data/4_subsequence_1.txt'
    inFile = open(inputFilePath, 'r')

import Fasta

# read inputs
dnas = Fasta.parse_fasta(inFile)
inFile.close()

s = dnas[0]
t = dnas[1]

subseqIndexes = []

si = -1
for ti in range(len(t)):
    tchar = t[ti]
    while 1:
        si += 1
        if s[si] == tchar:
            subseqIndexes.append(si)
            break
Example #38
def get_response_content(fs):
    # init the response and get the user variables
    out = StringIO()
    nleaves = fs.nleaves
    nvertices = nleaves * 2 - 1
    nbranches = nvertices - 1
    nsites = fs.nsites
    # sample the coalescent tree with timelike branch lengths
    R, B = kingman.sample(fs.nleaves)
    r = Ftree.R_to_root(R)
    # get the leaf vertex names
    N = dict(zip(range(nleaves), string.uppercase[:nleaves]))
    N_leaves = dict(N)
    # get the internal vertex names
    v_to_leaves = R_to_v_to_leaves(R)
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) > 1:
            N[v] = ''.join(sorted(N[leaf] for leaf in leaves))
    # get vertex ages
    v_to_age = kingman.RB_to_v_to_age(R, B)
    # sample the rates on the branches
    b_to_rate = sample_b_to_rate(R)
    xycorr = get_correlation(R, b_to_rate)
    # define B_subs in terms of substitutions instead of time
    B_subs = dict((p, t * b_to_rate[p]) for p, t in B.items())
    # sample the alignment
    v_to_seq = sample_v_to_seq(R, B_subs, nsites)
    # get the log likelihood; this is kind of horrible
    pairs = [(N[v], ''.join(v_to_seq[v])) for v in range(nleaves)]
    headers, sequences = zip(*pairs)
    alignment = Fasta.create_alignment(headers, sequences)
    newick_string = FtreeIO.RBN_to_newick(R, B_subs, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix() 
    ordered_states = list('ACGT') 
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states) 
    ll = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are all 1.0
    newick_string = FtreeIO.RBN_to_newick(R, B, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    ll_unity = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are numerically optimized
    # TODO incorporate the result into the xml file
    # TODO speed up the likelihood evaluation (beagle? C module?)
    #f = Opt(R, B, N_leaves, alignment)
    #X_logs = [0.0] * nbranches
    #result = scipy.optimize.fmin(f, X_logs, full_output=True)
    #print result
    #
    print >> out, '<?xml version="1.0"?>'
    print >> out, '<beast>'
    print >> out
    print >> out, '<!-- actual rate autocorrelation', xycorr, '-->'
    print >> out, '<!-- actual root height', v_to_age[r], '-->'
    print >> out, '<!-- actual log likelihood', ll, '-->'
    print >> out, '<!-- ll if rates were unity', ll_unity, '-->'
    print >> out
    print >> out, '<!--'
    print >> out, 'predefine the taxa as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    print >> out, get_leaf_taxon_defn(list(string.uppercase[:nleaves]))
    print >> out
    print >> out, '<!--'
    print >> out, 'define the alignment as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    print >> out, get_alignment_defn(leaves, N, v_to_seq)
    print >> out
    print >> out, '<!--'
    print >> out, 'specify the starting tree as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, get_starting_tree_defn(R, B, N_leaves)
    print >> out
    print >> out, '<!--'
    print >> out, 'connect the tree model as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, g_tree_model_defn
    print >> out
    print >> out, g_uncorrelated_relaxed_clock_info
    print >> out
    """
    print >> out, '<!--'
    print >> out, 'create a list of taxa for which to constrain the mrca as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) > 1:
            print >> out, get_mrca_subset_defn(N, v, leaves)
    print >> out
    print >> out, '<!--'
    print >> out, 'create a tmrcaStatistic that will record the height as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) > 1:
            print >> out, get_mrca_stat_defn(N[v])
    """
    print >> out
    print >> out, g_likelihood_info
    print >> out
    print >> out, '<!--'
    print >> out, 'run the mcmc'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    print >> out, get_mcmc_defn(v_to_leaves, v_to_age, N)
    print >> out
    print >> out, '</beast>'
    # return the response
    return out.getvalue()
Example #39
#!/usr/bin/env python

"""
fastaSplit.py <fasta file> <output dir>
"""

import sys
import Fasta


if '-h' in sys.argv or '--help' in sys.argv or len(sys.argv)==1:
    sys.exit(__doc__)

Fasta.split(sys.argv[1], oDir=sys.argv[2])
Example #40
import os
import re
import sys

if __name__ == '__main__':
    inFile = sys.stdin
    codonMapFile = sys.argv[1]
else:
    workDir = "/home/ashis/work/github/courses/JHU_Computational_Genomics/HW3"
    os.chdir(workDir)
    sys.path.append(os.path.abspath(os.getcwd()))
    inputFilePath = "data/rosalind_splc.txt"
    inFile = open(inputFilePath, "r")
    codonMapFile = "inputs/rna-codon.txt"

import Dna
import Rna
import Fasta

# read inputs

lines = Fasta.parse_fasta(inFile)
inFile.close()

dna = lines[0]
introns = lines[1:]

# splice dna
re_pattern = "|".join(introns)
splicedDna = re.sub(re_pattern, "", dna, 0)
sdna = Dna.Dna(splicedDna)

# transcribe and translate
rna = Rna.Rna(sdna.transcribe(), codonMapFile=codonMapFile)
protein = rna.translate()
print(protein)
Example #41
#!/usr/bin/env python

"""
reverse_comp.py <filename>

Prints the reverse complement of a DNA string (in Fasta format).
"""

import sys

import Fasta
import Sequence


if len(sys.argv) != 2 or "-h" in sys.argv or "--help" in sys.argv:
    sys.exit(__doc__)

iFilename = sys.argv[1]
header, seq = Fasta.load(iFilename)

seq = Sequence.reverse_complement(seq.upper())

print ">%s" % header
for i in xrange(0, len(seq), 80):
    print seq[i : i + 80]
Example #42
#!/usr/bin/env python

"""
fastaLength.py <input filename>
"""

import sys
import Fasta


if len(sys.argv)==1 or '-h' in sys.argv or '--help' in sys.argv:
    sys.exit(__doc__)

faFile = Fasta.load_mfa_iter(sys.argv[1])

for h,s in faFile:
    print h, len(s)
Example #43
#!/usr/bin/env python

"""
translate.py <filename>

Translates a DNA sequence to a protein sequence
"""

import sys

import Fasta
import Sequence


if len(sys.argv)!=2 or '-h' in sys.argv or '--help' in sys.argv:
    sys.exit(__doc__)

w = 60

iFilename = sys.argv[1]
faFile = Fasta.load_mfa_iter(iFilename)
for header,seq in faFile:
    protein = Sequence.translate(seq)

    print '>%s' % header
    for i in xrange(0, len(protein), w):
        print protein[i:i+w]
Example #44
import os
import re
import sys

if __name__ == '__main__':
    #print(sys.argv)
    inFile = sys.stdin
else:
    workDir = '/home/ashis/work/github/courses/JHU_Computational_Genomics/HW3'
    os.chdir(workDir)
    sys.path.append(os.path.abspath(os.getcwd()))
    inputFilePath = 'data/6_edit_alignment_1.txt'
    inFile = open(inputFilePath, 'r')

import Fasta

# read inputs
proteins = Fasta.parse_fasta(inFile)
inFile.close()

protein1 = proteins[0]
protein2 = proteins[1]

sigma = 1

## create scores and backtrack arrays
len1 = len(protein1)
len2 = len(protein2)
scores = [[0]*(len2+1) for x in range(len1+1)]
backtracks = [[-1]*(len2+1) for x in range(len1+1)]

## put first row and first column of scores and backtrack arrays
for i in range(1,len1+1):
Example #45
#!/usr/bin/env python

import re
import Fasta


classes = {
    'Extended B': (14850000, 15460000),
    'Class I-II': (15460000, 17100000),
    'Class III': (17100000, 17950000),
    'Framework': (17950000, 19265000),
    'Extended A': (19265000, 19400000)
}

header,seq = Fasta.load('scaffold_42.fa')

for name,(start,end) in classes.items():
    name = '_'.join(name.split())
    new_header = '%s.%s-%s' % (header, start, end)
    Fasta.save('%s.fa' % name, new_header, seq[start-1:end])
    
Example #46
def get_header_seq_pairs():
    lines = g_fasta_string.splitlines()
    return list(Fasta.gen_header_sequence_pairs(lines))