Example #1
def parse_file(file_name, type='DNA'):
    """Parse the given file into a FastaAlignment object.

    Arguments:
    o file_name - The location of the file to parse.
    o type - The type of information contained in the file.
    """
    if type.upper() == 'DNA':
        alphabet = IUPAC.ambiguous_dna
    elif type.upper() == 'RNA':
        alphabet = IUPAC.ambiguous_rna
    elif type.upper() == 'PROTEIN':
        alphabet = IUPAC.protein
    else:
        raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
                         % type)

    # create a new alignment object
    fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))

    # now parse the file and fill up the alignment object
    align_file = open(file_name, 'r')

    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(align_file, parser)

    cur_align = iterator.next()
    while cur_align is not None:
        fasta_align.add_sequence(cur_align.title, cur_align.sequence)

        cur_align = iterator.next()

    align_file.close()
    return fasta_align
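
A minimal usage sketch for parse_file above; the file name is a placeholder, and it assumes the FastaAlignment object renders itself as FASTA text when printed:

align = parse_file('my_align.fasta', type='PROTEIN')  # placeholder path
print align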
Example #2
    def test_schema_representation(self):
        """Convert sequences into schema representations.
        """
        # get a set of schemas we want to code the sequence in
        schema_bank = self._load_schema_repository()
        top_schemas = schema_bank.get_top(25)
        schema_coder = Schema.SchemaCoder(top_schemas, self.schema)

        # get the sequences one at a time, and encode them
        fasta_handle = open(self.test_file, 'r')

        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(fasta_handle, seq_parser)

        while 1:
            seq_record = iterator.next()

            if seq_record is None:
                break

            schema_values = schema_coder.representation(seq_record.seq)
            if VERBOSE:
                print "Schema values:", schema_values

        fasta_handle.close()
Example #3
    def setUp(self):
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        for file, records in ((test_file, self.test_records),
                              (diff_file, self.diff_records)):

            handle = open(file, 'r')

            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)
            while 1:
                seq_record = iterator.next()

                if seq_record is None:
                    break

                records.append(seq_record)

            handle.close()

        self.motif_finder = Motif.MotifFinder()
Example #4
    def setUp(self):
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        for file, records in ((test_file, self.test_records),
                              (diff_file, self.diff_records)):

            handle = open(file, 'r')

            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)
            while 1:
                seq_record = iterator.next()

                if seq_record is None:
                    break

                records.append(seq_record)

            handle.close()

        self.num_schemas = 2
        schema_ga = Schema.GeneticAlgorithmFinder()
        schema_ga.min_generations = 1
        self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                          schema_finder=schema_ga)
Example #5
 def test_record_iterator(self):
     """Test the iterator with a Record Parser.
     """
     parser = Fasta.RecordParser()
     iterator = Fasta.Iterator(self.test_handle, parser)
     for rec in iter(iterator):
         assert isinstance(rec, Fasta.Record)
Example #6
 def test_sequence_iterator(self):
     """Test the iterator with a Sequence Parser.
     """
     parser = Fasta.SequenceParser()
     iterator = Fasta.Iterator(self.test_handle, parser)
     for rec in iter(iterator):
         assert isinstance(rec, SeqRecord.SeqRecord)
Example #7
 def ReadFile(self):
     self.parser = Fasta.RecordParser()
     self.iter = Fasta.Iterator(handle=open(self.file), parser=self.parser)
     while 1:
         rec = self.iter.next()
         if not rec: break
         self.header = rec.title.split()[0].split(',')[0]
         self.HandleRecord(rec)
Example #8
    def read_fasta_file(self, file):
        genes = []
        iter = Fasta.Iterator(handle = open(file), parser = Fasta.RecordParser())
        while 1:
            rec = iter.next()
            if not rec: break
            genes.append((rec.sequence, rec.title))

        return genes
Example #9
 def test_parsing_comments(self):
     """Parse FASTA files with # style comment lines in them.
     """
     handle = open(os.path.join("Fasta", "f003"))
     iterator = Fasta.Iterator(handle, Fasta.RecordParser())
     num_recs = 0
     for rec in iter(iterator):
         num_recs += 1
     assert num_recs == 2
Example #10
def runDisEMBLpipeline():
    try:
        smooth_frame = 8
        peak_frame = 8
        join_frame = 4
        fold_coils = 1.2
        fold_hotloops = 1.4
        fold_rem465 = 1.2
        mode = 'scores'
        try:
            file = open(sys.argv[1], 'r')
        except (IndexError, IOError):
            # no readable file argument; fall back to reading from stdin
            mode = 'default'
            file = sys.stdin
    except:
        print '\nDisEMBL.py sequence_file \n'
        print 'A default run would be: ./DisEMBL.py fasta_file'
        raise SystemExit
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(file, parser)
    while 1:
        try:
            cur_record = iterator.next()
            sequence = upper(cur_record.sequence)
            # Run NN
            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
            # Run Savitzky-Golay
            REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw)
            COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw)
            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw)

            sys.stdout.write('> ' + cur_record.title + '\n')
            sys.stdout.write('# COILS ')
            reportSlicesTXT(
                getSlices(COILS_smooth, fold_coils, join_frame, peak_frame,
                          0.43), sequence)
            sys.stdout.write('# REM465 ')
            reportSlicesTXT(
                getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame,
                          0.50), sequence)
            sys.stdout.write('# HOTLOOPS ')
            reportSlicesTXT(
                getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame,
                          peak_frame, 0.086), sequence)
            sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
            for i in range(len(REM465_smooth)):
                sys.stdout.write(sequence[i] + '\t' +
                                 fpformat.fix(COILS_smooth[i], 5) + '\t' +
                                 fpformat.fix(REM465_smooth[i], 5) + '\t' +
                                 fpformat.fix(HOTLOOPS_smooth[i], 5) + '\n')
        except AttributeError:
            break
    file.close()
    return
Example #11
 def test_sequence_alphabet(self):
     """Setting the alphabet for the Sequence Parser.
     """
     parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
     rec = parser.parse(self.handles[0])
     assert rec.seq.alphabet == IUPAC.unambiguous_dna
Example #12
 def test_new_iterator(self):
     """Ensure the Fasta iterator works like a Python 2.2 iterator.
     """
     n = 0
     iterator = Fasta.Iterator(self.test_handle)
     for rec in iter(iterator):
         n += 1
     assert n == 3
Example #13
 def test_record_parser(self):
     """Basic operation of the Record Parser.
     """
     parser = Fasta.RecordParser()
     for index in range(len(self.handles)):
         handle = self.handles[index]
         rec = parser.parse(handle)
         assert isinstance(rec, Fasta.Record)
         assert len(rec.title) == self.lengths[index][0]
         assert len(rec.sequence) == self.lengths[index][1]
Example #14
 def test_sequence_title_convert(self):
     """Test title conversion for the Sequence Parser.
     """
     def test_title2ids(title):
         return "id", "name", "description"
     parser = Fasta.SequenceParser(title2ids = test_title2ids)
     rec = parser.parse(self.handles[0])
     assert rec.id == "id"
     assert rec.name == "name"
     assert rec.description == "description"
Example #15
def runDisEMBLpipeline():
    try:
        smooth_frame = int(sys.argv[1])
        peak_frame = int(sys.argv[2])
        join_frame = int(sys.argv[3])
        fold_coils = float(sys.argv[4])
        fold_hotloops = float(sys.argv[5])
        fold_rem465 = float(sys.argv[6])
        file = str(sys.argv[7])
    except (IndexError, ValueError):
        print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file\n'
        print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2  fasta_file'
        raise SystemExit
    db = open(file, 'r')
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(db, parser)
    while 1:
        try:
            cur_record = iterator.next()
            sequence = upper(cur_record.sequence)
            # Run NN
            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
            # Run Savitzky-Golay
            REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw)
            COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw)
            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw)
            sys.stdout.write('> ' + cur_record.title + '_COILS ')
            reportSlicesTXT(
                getSlices(COILS_smooth, fold_coils, join_frame, peak_frame,
                          0.43), sequence)
            sys.stdout.write('> ' + cur_record.title + '_REM465 ')
            reportSlicesTXT(
                getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame,
                          0.50), sequence)
            sys.stdout.write('> ' + cur_record.title + '_HOTLOOPS ')
            reportSlicesTXT(
                getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame,
                          peak_frame, 0.086), sequence)
            sys.stdout.write('\n')
        except AttributeError:
            break
    return
Example #16
    def test_record_basic(self):
        """Basic test on Record
        """
        def pbool(b):
            if b:
                return 1
            return 0

        r = Fasta.Record()
        assert pbool(type(r.title) is StringType)    # StringType
        assert pbool(type(r.sequence) is StringType) # StringType
Example #17
    def _load_schema_repository(self):
        """Helper function to load a schema repository from a file.

        This also caches a schema bank, to prevent having to do this
        time consuming operation multiple times.
        """
        # if we already have a cached repository, return it
        if self.schema_bank is not None:
            return self.schema_bank

        # otherwise, we'll read in a new schema bank

        # read in all of the motif records
        motif_handle = open(self.test_file, 'r')

        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(motif_handle, seq_parser)

        seq_records = []
        while 1:
            seq_record = iterator.next()

            if seq_record is None:
                break

            seq_records.append(seq_record)

        motif_handle.close()

        # find motifs from the file
        motif_finder = Motif.MotifFinder()
        motif_size = 9

        motif_bank = motif_finder.find(seq_records, motif_size)

        schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

        # cache the repository
        self.schema_bank = schema_bank

        return schema_bank
Example #18
    def setUp(self):
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

        self.test_records = []

        # load the records
        handle = open(test_file, 'r')

        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(handle, seq_parser)
        while 1:
            seq_record = iterator.next()

            if seq_record is None:
                break

            self.test_records.append(seq_record)

        handle.close()

        self.sig_finder = Signature.SignatureFinder()
Example #19
def main():
    # create a substitution matrix
    sub_matrix = SubstitutionMatrix('blosum50')
    
    # set up for alignment
    aligner = NWAlign(sub_matrix)
    print "Testing a simple alignment..."
    seq1 = "HEAGAWGHEE"
    seq2 = "PAWHEAE"
    
    aligner.align(seq1, seq2)
    
    align1, align2 = aligner.get_optimal_alignment()
    score = aligner.get_optimal_score()
    
    print "Alignment Score:", score
    print align1.data
    print align2.data
    
    print "Testing a more complex alignment..."
    test_file = "PEPCarboxylase.fasta"
    
    print "Getting sequences from the file PEPCarboxylase.fasta..."
    seq_list = []
    scanner = Fasta._Scanner()
    handler = FASTAHandler(seq_list)
    file = open(test_file, 'r')
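    # each feed() call parses one record from the handle, so two calls read two sequences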
    scanner.feed(file, handler)
    scanner.feed(file, handler)
    #print seq_list
    
    print "Aligning sequences..."
    aligner = NWAlign(sub_matrix)
    aligner.align(seq_list[0][0:150], seq_list[1][0:150])
    
    align1, align2 = aligner.get_optimal_alignment()
    score = aligner.get_optimal_score()
    
    print "Alignment Score:", score
    line_width = 25
    current_position = line_width
    # pretty print the alignment
    while current_position < len(align1):
        print ""
        print align1.data[current_position - line_width:current_position]
        print align2.data[current_position - line_width:current_position]
        current_position = current_position + line_width
        
    # print whatever is left
    print ""
    print align1.data[current_position - line_width:]
    print align2.data[current_position - line_width:]
Example #20
    def __str__(self):
        """Print out a fasta version of the alignment info."""
        return_string = ''
        for item in self._records:
            new_f_record = Fasta.Record()
            new_f_record.title = item.description
            new_f_record.sequence = item.seq.data

            return_string = return_string + str(new_f_record) + os.linesep + os.linesep

        # we end up with extra trailing newlines, so strip them off and add one back
        return return_string.rstrip() + os.linesep
Example #21
def extract_organisms(file, num_records):
    scanner = Fasta._Scanner()
    consumer = SpeciesExtractor()

    file_to_parse = UndoHandle(open(file, "r"))

    for fasta_record in range(num_records):
        scanner.feed(file_to_parse, consumer)

    file_to_parse.close()

    return consumer.species_list
Example #22
 def test_sequence_parser(self):
     """Basic operation of the Sequence Parser.
     """
     parser = Fasta.SequenceParser()
     for index in range(len(self.handles)):
         handle = self.handles[index]
         rec = parser.parse(handle)
         assert isinstance(rec, SeqRecord.SeqRecord)
         assert isinstance(rec.seq, Seq.Seq)
         assert rec.seq.alphabet == Alphabet.generic_alphabet
         assert len(rec.seq) == self.lengths[index][1]
         assert len(rec.description) == self.lengths[index][0]
Example #23
def extract_organisms(file_to_parse):
    # set up the parser and iterator
    parser = Fasta.RecordParser()
    file = open(file_to_parse, 'r')
    iterator = Fasta.Iterator(file, parser)

    all_species = []

    while 1:
        cur_record = iterator.next()

        if cur_record is None:
            break

        # extract the info from the title
        new_species = cur_record.title.split()[1]

        # append the new species to the list if it isn't there
        if new_species not in all_species:
            all_species.append(new_species)

    return all_species
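
A quick usage sketch for extract_organisms above, with a placeholder file name:

species = extract_organisms('ls_orchid.fasta')  # placeholder FASTA file
print 'Found %i distinct species' % len(species)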
Example #24
def get_seqs(blastRootDirectory):

    if len(sys.argv) >= 2:
        numSeqs = int(sys.argv[1])
        if numSeqs < 1 or numSeqs > 100000:
            print 'requested number of sequences is outside allowable range (1-100000). Using default (10)'
            numSeqs = 10
    else:
        numSeqs = 10
    print 'requesting', numSeqs, 'query sequences from the server'
    seqs = phamServer.request_seqs(server, numSeqs, client)
    # Build the file to be blasted from the sequences given.

    f = open(os.path.join(blastRootDirectory, 'filetoblast.txt'), 'w')

    print seqs
    # Take the new set of sequences and check whether they exist in the local
    # database; if so, write the sequence id and translation to a separate
    # FASTA-formatted input file to be passed to the BLASTALL executable.

    parser = Fasta.RecordParser()
    for GeneID in seqs:
        infile = open(os.path.join(blastRootDirectory, 'blastDB.fasta'))
        iterator = Fasta.Iterator(infile, parser)
        while 1:
            record = iterator.next()
            if not record:
                break
            record_id = record.title

            if GeneID == record_id:
                f.write('>' + record.title + '\n' + record.sequence + '\n')

        infile.close()

    f.close()
    return (len(seqs))
Example #25
    def test_basic_iterator(self):
        """Ensure the Fasta iterator works returning text.
        """
        i = Fasta.Iterator(self.test_handle)
        rec_info = {0 : ">gi|1348912|gb|G26680|G26680",
                    1 : ">gi|1348917|gb|G26685|G26685",
                    2 : ">gi|1592936|gb|G29385|G29385"}
        for rec_num in range(3):
            rec = i.next()
            lines = rec.split("\n")
            title_part = lines[0].split()
            assert title_part[0] == rec_info[rec_num]

        # make sure we keep getting None when the iterator is done
        assert i.next() is None
        assert i.next() is None
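
When no parser is given, Fasta.Iterator yields each record as raw text, as the test above relies on. A minimal sketch of the same mode outside a test, assuming Bio.Fasta is imported and a placeholder file name:

handle = open('ls_orchid.fasta')  # placeholder file
for rec_text in iter(Fasta.Iterator(handle)):
    print rec_text.split('\n')[0]  # the '>' title line of each record
handle.close()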
Example #26
 def __init__(self, **kwargs):
     self.db_dir = DEFAULT_DB_DIR
     self.index_filename = DEFAULT_INDEX_FILENAME
     self.seqres_filename = DEFAULT_SEQRES_FILENAME
     for key, value in kwargs.items():
         if key == 'db_dir':
             self.db_dir = value
         elif key == 'index_filename':
             self.index_filename = value
         elif key == 'seqres_filename':
             self.seqres_filename = value
     self.full_index_filename = os.path.join(self.db_dir,self.index_filename)
     self.full_seqres_filename = os.path.join(self.db_dir,self.seqres_filename)
     self.offsets = {}
     self.namesByPdbid = {}
     self.seqres_file = open(self.full_seqres_filename)
     self.fasta_parser = Fasta.RecordParser()
     self.load_index_file()
Example #27
def main(blast_file):
    db_dir = os.path.join(os.getcwd(), "db")
    cur_dbs = get_available_dbs(db_dir)
    length_cutoff = 0.2
    blast_clusters, all_lengths = get_blast_clusters(blast_file, length_cutoff)
    filter_clusters = filter_by_organism(blast_clusters, org_includes, cur_dbs)
    length_plot(all_lengths, blast_file)
    cluster_grouper = SimilarityClusterGrouper(2, 200, [(0.9, 10)])
    all_groups = cluster_grouper.get_final_groups(filter_clusters)
    base, ext = os.path.splitext(blast_file)
    cluster_file = base + "-bcluster%s.txt"
    for gindex, group in enumerate(all_groups):
        print '-----------'
        with open(cluster_file % gindex, "w") as out_handle:
            for gitem in group:
                db_rec = get_db_rec(gitem, cur_dbs)
                print gitem, db_rec["org_scientific_name"]
                rec = Fasta.Record()
                rec.title = gitem
                rec.sequence = db_rec["seq"]
                out_handle.write(str(rec) + "\n")
Example #28
# This script reads the FASTA file ls_orchid.fasta
# and builds an index as a set of files on disc in the sub-directory
# my_orchid_dict.idx
# Note that the alphabet is explicitly defined for the sequences.

import os
from Bio import Fasta
from Bio.Alphabet import IUPAC

def get_accession_num(fasta_record):
    title_atoms = fasta_record.title.split()
    accession_atoms = title_atoms[0].split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning
    return gb_name[:-2]

if not os.path.isdir("my_orchid_dict.idx") :
    #Build a new index
    Fasta.index_file("ls_orchid.fasta", "my_orchid_dict.idx",
                     get_accession_num)
else :
    print "Reusing existing index"

dna_parser = Fasta.SequenceParser(IUPAC.ambiguous_dna)

orchid_dict = Fasta.Dictionary("my_orchid_dict.idx", dna_parser)

for id_num in orchid_dict.keys():
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq
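
The index built above is also assumed to support random access by accession number; a hedged sketch, where 'Z78533' stands in for one of the keys printed by the loop:

seq_record = orchid_dict['Z78533']  # assumed accession-number key
print 'looked up Z78533:', len(seq_record.seq), 'bases'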
Example #29
    else:
        return open(file_name, 'r')


if __name__ == "__main__":

    import getopt

    opts, args = getopt.getopt(sys.argv[1:], 'hs:t:')

    if not opts or len(args) != 1:
        usage()
        sys.exit('Usage error')

    fasta_file = open(args[0])
    parser = Fasta.RecordParser()

    for o, a in opts:
        if o == '-h':
            usage()
            sys.exit(0)
        elif o == '-s':
            sieve = get_sieve(get_input_handle(a))
            iterator = FastaSelectiveIterator(sieve, fasta_file, parser)
            for record in iterator:
                print record
        elif o == '-t':
            translator = FastaTranslator(get_input_handle(a), reverse=True)
            iterator = Fasta.Iterator(fasta_file, parser)
            for record in iterator:
                print translator(record)
Example #30
#!/usr/bin/env python
"""Example showing how to deal with internet BLAST from Biopython.

This code is described in great detail in the BLAST section of the Biopython
documentation.
"""
# standard library
import cStringIO

# biopython
from Bio.Blast import NCBIWWW
from Bio import Fasta

# first get the sequence we want to parse from a FASTA file
file_for_blast = open('m_cold.fasta', 'r')
f_iterator = Fasta.Iterator(file_for_blast)

f_record = f_iterator.next()

print 'Doing the BLAST and retrieving the results...'
result_handle = NCBIWWW.qblast('blastn', 'nr', f_record)

# save the results for later, in case we want to look at it
save_file = open('m_cold_blast.out', 'w')
blast_results = result_handle.read()
save_file.write(blast_results)
save_file.close()

print 'Parsing the results and extracting info...'
b_parser = NCBIWWW.BlastParser()
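
The snippet stops after constructing the parser; presumably the saved text is then re-parsed via cStringIO, which would explain the import above. A hedged continuation:

string_handle = cStringIO.StringIO(blast_results)
b_record = b_parser.parse(string_handle)  # assumes the parser's parse(handle) method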
Example #31
def main(ipr_number, num_clusters, out_dir):
    charge_window = 75
    db_dir = os.path.join(os.getcwd(), "db")
    cur_db = shelve.open(os.path.join(db_dir, ipr_number))
    tax_graph = build_tax_graph(cur_db)
    uniprot_ids = []
    info_array = []
    for db_domain in cur_db.keys():
        if not cur_db[db_domain].get("is_uniref_child", ""):
            uniprot_ids.append(db_domain)
            db_item = cur_db[db_domain]
            cur_cluster_info = [
                float(db_item["charge"]),
                float(db_item["charge_region"]) * 10.0,
                len(db_item.get("db_refs", [])) * 5.0,
                calc_domain_distance(db_item) * 50.0,
                #max(len(db_item.get("string_interactors", [])) - 1, 0),
            ]
            info_array.append(cur_cluster_info)
    info_array = numpy.array(info_array)
    print 'Num genes', len(info_array), num_clusters
    cluster_ids, error, nfound = Cluster.kcluster(
        info_array, nclusters=num_clusters, npass=50)  #, method='a', dist='c')
    #tree = Cluster.treecluster(info_array, method='a', dist='c')
    #cluster_ids = tree.cut(num_clusters)
    cluster_dict = collections.defaultdict(list)
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    out_seq_file = os.path.join(out_dir, "%s-seqs.fa" % (ipr_number))
    out_seq_handle = open(out_seq_file, "w")
    for index, cluster_group in enumerate(cluster_dict.values()):
        print '***********', index
        org_dists = []
        for uniprot_id in cluster_group:
            org = cur_db[uniprot_id]["org_scientific_name"]
            distance = networkx.dijkstra_path_length(tax_graph, 'Mus musculus',
                                                     org)
            org_dists.append((distance, org, uniprot_id))
        org_dists.sort()
        members = []
        for d, o, u in org_dists:
            charge_plot_img = calc_charge_plot(u, cur_db[u], charge_window,
                                               out_dir)
            base, ext = os.path.splitext(charge_plot_img)
            disorder_plot_img = "%s-idr%s" % (base, ext)
            rec = Fasta.Record()
            rec.title = u
            rec.sequence = cur_db[u]["seq"]
            out_seq_handle.write(str(rec) + "\n")
            members.append(
                dict(
                    organism=o,
                    uniprot_id=get_uniprot_links([u]),
                    alt_names=get_alt_names(cur_db[u]),
                    alt_ids=get_uniprot_links(cur_db[u].get(
                        "uniref_children", [])),
                    charge=cur_db[u]["charge"],
                    charge_region="%0.2f" % cur_db[u]["charge_region"],
                    charge_plot_img=charge_plot_img,
                    disorder_plot_img=disorder_plot_img,
                    domains=len(cur_db[u].get("db_refs", [])),
                    interactions=get_string_link(
                        u,
                        max(
                            len(cur_db[u].get("string_interactors", [])) - 1,
                            0)),
                    description=cur_db[u].get("function_descr", "&nbsp;"),
                    c_distance="%0.2f" % calc_domain_distance(cur_db[u]),
                ))
        with open(
                os.path.join(out_dir,
                             "%s-cluster%s.html" % (ipr_number, index)),
                "w") as out_handle:
            tmpl = Template(cluster_template)
            out_handle.write(tmpl.render(cluster_members=members))
Example #32
def initialize():

    """
    Parse command line options, and read input Fasta file.

    Construct a dictionary contains the following fields:

        sequences         a list of dictionary objects having 'title',
                          'sequence', and 'motif_position' attributes (see
                          also the docstring of gibbs.Gibbs.__init__)
        width             width of motif to find
        weight            weight to use for pseudocounts
        iterations        number of non-improving iterations before stopping
        shifts            maximum phase shifts to detect
        ps_freq           frequency of detecting phase shifts
        init_occurrences  number of base occurrences to use for initial motif
                          positions heuristic
        init_width        width of patterns to use for initial motif positions
                          heuristic

    Return the constructed dictionary.
    """

    parser = OptionParser(usage = "usage: %prog -i FILE -w WIDTH [-h] "
                          "[options]",
                          version = "PyMotif %s (%s)" % (VERSION, DATE),
                          description = "PyMotif is an implementation of the "
                          "Gibbs sampling algorithm for finding local "
                          "alignments of DNA sequences. "
                          "See the accompanied README file for usage "
                          "instructions and the documentation directory for "
                          "implementation details.")

    parser.add_option("-i", "--input", dest="input", metavar="FILE",
                      help="read FILE in Fasta format")
    parser.add_option("-w", "--width", dest="width", metavar="WIDTH",
                      type="int", help="find motif of width WIDTH")
    parser.add_option("-t", "--iterations", dest="iterations",
                      metavar="ITERATIONS", default=ITERATIONS_DEFAULT,
                      type="int", help="number of non-improving iterations "
                      "(default " + str(ITERATIONS_DEFAULT) + ")")
    parser.add_option("-p", "--pseudo", dest="pseudo", metavar="WEIGHT",
                      default=PSEUDOCOUNTS_WEIGHT_DEFAULT, type="float",
                      help="use WEIGHT for weight of pseudocounts (default " +
                      str(PSEUDOCOUNTS_WEIGHT_DEFAULT) + ")")
    parser.add_option("-s", "--phase-shifts", dest="shifts", metavar="SHIFTS",
                      default=PHASE_SHIFTS_DEFAULT, type="int",
                      help="detect phase shifts of width SHIFTS (default " +
                      str(PHASE_SHIFTS_DEFAULT) + ")")
    parser.add_option("-f", "--ps-frequency", dest="frequency",
                      metavar="FREQ", default=PS_FREQUENCY_DEFAULT,
                      type="int", help="if SHIFTS>0, detect phase shifts "
                      "every FREQ iterations (default " +
                      str(PS_FREQUENCY_DEFAULT) + ")")
    parser.add_option("-n", "--init-num-occurrences", dest="initoccurrences",
                      metavar="OCCURRENCES",
                      default=INIT_NUM_OCCURRENCES_DEFAULT, type="int",
                      help="number of base occurrences to use for initial "
                      "positions heuristic (default " +
                      str(INIT_NUM_OCCURRENCES_DEFAULT) + ")")
    parser.add_option("-v", "--init-pattern-width", dest="initwidth",
                      metavar="WIDTH", default=INIT_PATTERN_WIDTH_DEFAULT,
                      type="int", help="if OCCURRENCES>0, width of pattern "
                      "to use for initial positions heuristic (defaults to "
                      "value of --width)")
    parser.add_option("-c", "--cow", action="store_true", dest="cow",
                      default=False, help="display cow (not recommended)")

    (options, args) = parser.parse_args()

    if options.cow:
        s = ""
        for _ in range(10):
            s += choice("ATCG")
        # Created with the cowsay program
        print """ ____________
< %s >
 ------------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\\
                ||----w |
                ||     ||""" % s
        sys.exit(0)

    if not options.input:
        parser.error("input file required")

    if not options.width:
        parser.error("width argument required")

    if options.width < 2:
        parser.error("please use a sane motif width")

    # Read contents of Fasta file
    try:
        file = open(options.input)
    except IOError:
        parser.error("could not read file %s" % options.input)

    fasta_parser = Fasta.RecordParser()

    # Iterator for sample data
    fasta_iterator = Fasta.Iterator(file, fasta_parser)

    # A list containing a dictionary object for each sequence
    sequences = [{'title':          record.title,
                  'sequence':       record.sequence,
                  'motif_position': 0}
                 for record in fasta_iterator]

    # We could do some more error checking on the input file here, like
    # checking there's only ATCG and at least a few of them, but for now
    # this is enough
    if len(sequences) < 2:
        parser.error("need at least two sequences, but found %i in input "
                     "file %s" % (len(sequences), options.input))

    return {'sequences':        sequences,
            'width':            options.width,
            'weight':           options.pseudo,
            'iterations':       options.iterations,
            'shifts':           options.shifts,
            'ps_freq':          options.frequency,
            'init_occurrences': options.initoccurrences,
            'init_width':       options.initwidth}
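
A hypothetical driver for initialize() above, assuming it is run as a script:

if __name__ == '__main__':
    data = initialize()  # parse options and read the Fasta input
    print 'Read %i sequences; searching for motifs of width %i' % (
        len(data['sequences']), data['width'])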
Example #33
#! /usr/bin/env python

import sys, os
import time
from Bio import Fasta

DEFAULT_DICT_FILE = '/project1/structure/mliang/pdb/derived_data/pdb_seqres.idx'
DEFAULT_OUTFH = sys.stdout

dict_file = DEFAULT_DICT_FILE
outfh = DEFAULT_OUTFH

start_time = time.time()
fdict = Fasta.Dictionary(dict_file)
elapse_time = time.time() - start_time
print >> sys.stderr, "Time to load dictionary:", elapse_time

start_time = time.time()
chainmap = {}
for key in fdict.keys():
    chainmap.setdefault(key[:4], []).append(key)
elapse_time = time.time() - start_time
print >> sys.stderr, "Time to build chain map:", elapse_time

start_time = time.time()
args = sys.argv[1:]
if not args:
    args = sys.stdin

for field in args:
    fields = field.strip().split()