Exemple #1
0
    def setUp(self):
        """Load the FASTA fixtures and create the motif finder under test.

        Every record of ``NeuralNetwork/enolase.fasta`` is appended to
        ``self.test_records`` and every record of
        ``NeuralNetwork/repeat.fasta`` to ``self.diff_records``; both files
        are parsed with the unambiguous DNA alphabet.  A
        ``Motif.MotifFinder`` is stored in ``self.motif_finder``.
        """
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        for fasta_path, records in ((test_file, self.test_records),
                                    (diff_file, self.diff_records)):
            handle = open(fasta_path, 'r')
            # try/finally so a parser error cannot leak the open handle
            try:
                seq_parser = Fasta.SequenceParser(
                    alphabet=IUPAC.unambiguous_dna)
                iterator = Fasta.Iterator(handle, seq_parser)

                # next() returns None once the file is exhausted
                seq_record = iterator.next()
                while seq_record is not None:
                    records.append(seq_record)
                    seq_record = iterator.next()
            finally:
                handle.close()

        self.motif_finder = Motif.MotifFinder()
Exemple #2
0
    def setUp(self):
        """Load the FASTA fixtures and build the schema finder under test.

        Every record of ``NeuralNetwork/enolase.fasta`` is appended to
        ``self.test_records`` and every record of
        ``NeuralNetwork/repeat.fasta`` to ``self.diff_records``; both files
        are parsed with the unambiguous DNA alphabet.  ``self.finder`` is a
        ``Schema.SchemaFinder`` configured for ``self.num_schemas`` schemas
        and driven by a ``Schema.GeneticAlgorithmFinder``.
        """
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        for fasta_path, records in ((test_file, self.test_records),
                                    (diff_file, self.diff_records)):
            handle = open(fasta_path, 'r')
            # try/finally so a parser error cannot leak the open handle
            try:
                seq_parser = Fasta.SequenceParser(
                    alphabet=IUPAC.unambiguous_dna)
                iterator = Fasta.Iterator(handle, seq_parser)

                # next() returns None once the file is exhausted
                seq_record = iterator.next()
                while seq_record is not None:
                    records.append(seq_record)
                    seq_record = iterator.next()
            finally:
                handle.close()

        self.num_schemas = 2
        schema_ga = Schema.GeneticAlgorithmFinder()
        # presumably kept at 1 so the test finishes quickly -- confirm
        schema_ga.min_generations = 1
        self.finder = Schema.SchemaFinder(num_schemas=self.num_schemas,
                                          schema_finder=schema_ga)
Exemple #3
0
def parse_file(file_name, type='DNA'):
    """Parse the given file into a FastaAlignment object.

    Arguments:
    o file_name - The location of the file to parse.
    o type - The type of information contained in the file: 'DNA', 'RNA'
    or 'PROTEIN' (compared case-insensitively).

    Returns a FastaAlignment, created on a gapped version of the alphabet
    selected by type, holding one sequence per record found in the file.

    Raises ValueError if type is not one of the supported names.
    """
    seq_type = type.upper()
    if seq_type == 'DNA':
        alphabet = IUPAC.ambiguous_dna
    elif seq_type == 'RNA':
        alphabet = IUPAC.ambiguous_rna
    elif seq_type == 'PROTEIN':
        alphabet = IUPAC.protein
    else:
        raise ValueError("Invalid type %s passed. Need DNA, RNA or PROTEIN"
                         % type)

    # create a new alignment object
    fasta_align = FastaAlignment(Alphabet.Gapped(alphabet))

    # now parse the file and fill up the alignment object
    align_file = open(file_name, 'r')
    # the handle used to be left open; close it even if parsing fails
    try:
        iterator = Fasta.Iterator(align_file, Fasta.RecordParser())

        cur_align = iterator.next()
        while cur_align:
            fasta_align.add_sequence(cur_align.title, cur_align.sequence)
            cur_align = iterator.next()
    finally:
        align_file.close()

    return fasta_align
Exemple #4
0
 def test_record_iterator(self):
     """Check that an iterator driven by a RecordParser yields Fasta.Record.
     """
     record_iterator = Fasta.Iterator(self.test_handle, Fasta.RecordParser())
     for parsed in iter(record_iterator):
         assert isinstance(parsed, Fasta.Record)
Exemple #5
0
 def test_sequence_iterator(self):
     """Check that an iterator driven by a SequenceParser yields SeqRecords.
     """
     seq_iterator = Fasta.Iterator(self.test_handle, Fasta.SequenceParser())
     for parsed in iter(seq_iterator):
         assert isinstance(parsed, SeqRecord.SeqRecord)
Exemple #6
0
    def test_schema_representation(self):
        """Convert sequences into schema representations.
        """
        # get a set of schemas we want to code the sequence in
        schema_bank = self._load_schema_repository()
        top_schemas = schema_bank.get_top(25)
        schema_coder = Schema.SchemaCoder(top_schemas, self.schema)

        # get the sequences one at a time, and encode them
        fasta_handle = open(self.test_file, 'r')

        seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
        iterator = Fasta.Iterator(fasta_handle, seq_parser)

        while 1:
            seq_record = iterator.next()

            if seq_record is None:
                break

            schema_values = schema_coder.representation(seq_record.seq)
            if VERBOSE:
                print "Schema values:", schema_values

        fasta_handle.close()
Exemple #7
0
 def ReadFile(self):
     """Feed every record of self.file to self.HandleRecord().

     Before each call, self.header is set to the first whitespace-separated
     word of the record title, cut at its first comma.  The parser and
     iterator are kept on the instance as self.parser and self.iter.
     """
     self.parser = Fasta.RecordParser()
     # NOTE(review): the handle opened here is never closed explicitly; it
     # stays reachable through self.iter -- confirm that is intended.
     self.iter = Fasta.Iterator(handle=open(self.file), parser=self.parser)
     rec = self.iter.next()
     while rec:
         first_word = rec.title.split()[0]
         self.header = first_word.split(',')[0]
         self.HandleRecord(rec)
         rec = self.iter.next()
Exemple #8
0
 def test_new_iterator(self):
     """Ensure the Fasta iterator can be consumed through iter().

     The fixture behind self.test_handle is expected to hold three records.
     """
     iterator = Fasta.Iterator(self.test_handle)
     seen = [rec for rec in iter(iterator)]
     assert len(seen) == 3
Exemple #9
0
 def test_parsing_comments(self):
     """Parse FASTA files with # style comment lines in them.

     The fixture ``Fasta/f003`` is expected to yield exactly two records.
     """
     handle = open(os.path.join("Fasta", "f003"))
     # the fixture used to be left open; close it even if parsing fails
     try:
         iterator = Fasta.Iterator(handle, Fasta.RecordParser())
         num_recs = 0
         for rec in iter(iterator):
             num_recs += 1
     finally:
         handle.close()
     assert num_recs == 2
Exemple #10
0
    def read_fasta_file(self, file):
        """Read a FASTA file into a list of (sequence, title) tuples.

        Arguments:
        o file - path of the FASTA file to read.

        Returns the list of tuples in file order.
        """
        genes = []
        handle = open(file)
        # the handle used to be left open; close it even if parsing fails
        try:
            record_iterator = Fasta.Iterator(handle=handle,
                                             parser=Fasta.RecordParser())
            rec = record_iterator.next()
            # a false value from next() marks the end of the file
            while rec:
                genes.append((rec.sequence, rec.title))
                rec = record_iterator.next()
        finally:
            handle.close()

        return genes
Exemple #11
0
def runDisEMBLpipeline():
    """Run the DisEMBL predictions for every record of a FASTA file.

    The path of the FASTA file is read from sys.argv[1].  For each record
    the raw COILS, HOTLOOPS and REM465 scores returned by JensenNet() are
    smoothed with SavitzkyGolay() and written to stdout: a '> title' line,
    the reportSlicesTXT() output for each predictor and a tab-separated
    per-residue table of the smoothed scores.

    If the argument is missing or the file cannot be opened, a usage
    message is printed and SystemExit is raised.
    """
    smooth_frame = 8
    peak_frame = 8
    join_frame = 4
    fold_coils = 1.2
    fold_hotloops = 1.4
    fold_rem465 = 1.2
    try:
        fasta_handle = open(sys.argv[1], 'r')
    except (IndexError, IOError):
        # previously a failed open left the handle unbound and the function
        # died later with a NameError instead of showing the usage text
        print('\nDisEMBL.py sequence_file \n')
        print('A default run would be: ./DisEMBL.py fasta_file')
        raise SystemExit
    #db = sys.stdin
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(fasta_handle, parser)
    try:
        while 1:
            cur_record = iterator.next()
            # next() returns None at end of file; test for it explicitly so
            # genuine AttributeErrors raised further down are not swallowed
            if cur_record is None:
                break
            sequence = cur_record.sequence.upper()
            # Run NN
            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
            # Run Savitzky-Golay
            REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw)
            COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw)
            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw)

            sys.stdout.write('> ' + cur_record.title + '\n')
            # the last getSlices() argument differs per predictor; its
            # meaning is defined by getSlices(), which is not shown here
            sys.stdout.write('# COILS ')
            reportSlicesTXT(
                getSlices(COILS_smooth, fold_coils, join_frame, peak_frame,
                          0.43), sequence)
            sys.stdout.write('# REM465 ')
            reportSlicesTXT(
                getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame,
                          0.50), sequence)
            sys.stdout.write('# HOTLOOPS ')
            reportSlicesTXT(
                getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame,
                          peak_frame, 0.086), sequence)
            sys.stdout.write('# RESIDUE COILS REM465 HOTLOOPS\n')
            for i in range(len(REM465_smooth)):
                sys.stdout.write(sequence[i] + '\t' +
                                 fpformat.fix(COILS_smooth[i], 5) + '\t' +
                                 fpformat.fix(REM465_smooth[i], 5) + '\t' +
                                 fpformat.fix(HOTLOOPS_smooth[i], 5) + '\n')
    finally:
        fasta_handle.close()
    return
Exemple #12
0
    def test_basic_iterator(self):
        """Ensure a parser-less Fasta iterator hands back raw record text.
        """
        text_iterator = Fasta.Iterator(self.test_handle)
        expected_ids = (">gi|1348912|gb|G26680|G26680",
                        ">gi|1348917|gb|G26685|G26685",
                        ">gi|1592936|gb|G29385|G29385")
        for expected_id in expected_ids:
            raw_record = text_iterator.next()
            # the identifier is the first word of the record's first line
            header_line = raw_record.split("\n")[0]
            assert header_line.split()[0] == expected_id

        # once exhausted, the iterator must answer None on every further call
        for _ in range(2):
            assert text_iterator.next() is None
Exemple #13
0
def runDisEMBLpipeline():
    """Run the DisEMBL predictions with parameters from the command line.

    sys.argv[1:8] must hold, in order: smooth_frame, peak_frame and
    join_frame (integers), fold_coils, fold_hotloops and fold_rem465
    (floats) and the path of the FASTA file.  For each record the raw
    scores returned by JensenNet() are smoothed with SavitzkyGolay() and
    the reportSlicesTXT() output for COILS, REM465 and HOTLOOPS is written
    to stdout, each preceded by a '> title_<predictor>' label.

    If an argument is missing or is not a valid number, a usage message is
    printed and SystemExit is raised.
    """
    try:
        smooth_frame = int(sys.argv[1])
        peak_frame = int(sys.argv[2])
        join_frame = int(sys.argv[3])
        fold_coils = float(sys.argv[4])
        fold_hotloops = float(sys.argv[5])
        fold_rem465 = float(sys.argv[6])
        fasta_path = str(sys.argv[7])
    except (IndexError, ValueError):
        print('\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file\n')
        print('A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2  fasta_file')
        raise SystemExit
    db = open(fasta_path, 'r')
    # the handle used to be left open; close it even if a predictor fails
    try:
        parser = Fasta.RecordParser()
        iterator = Fasta.Iterator(db, parser)
        while 1:
            cur_record = iterator.next()
            # next() returns None at end of file; test for it explicitly so
            # genuine AttributeErrors raised further down are not swallowed
            if cur_record is None:
                break
            sequence = cur_record.sequence.upper()
            # Run NN
            COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
            # Run Savitzky-Golay
            REM465_smooth = SavitzkyGolay(smooth_frame, 0, REM465_raw)
            COILS_smooth = SavitzkyGolay(smooth_frame, 0, COILS_raw)
            HOTLOOPS_smooth = SavitzkyGolay(smooth_frame, 0, HOTLOOPS_raw)
            # the last getSlices() argument differs per predictor; its
            # meaning is defined by getSlices(), which is not shown here
            sys.stdout.write('> ' + cur_record.title + '_COILS ')
            reportSlicesTXT(
                getSlices(COILS_smooth, fold_coils, join_frame, peak_frame,
                          0.43), sequence)
            sys.stdout.write('> ' + cur_record.title + '_REM465 ')
            reportSlicesTXT(
                getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame,
                          0.50), sequence)
            sys.stdout.write('> ' + cur_record.title + '_HOTLOOPS ')
            reportSlicesTXT(
                getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame,
                          peak_frame, 0.086), sequence)
            sys.stdout.write('\n')
    finally:
        db.close()
    return
Exemple #14
0
    def _load_schema_repository(self):
        """Helper function to load a schema repository from a file.

        The motifs of size 9 found in the records of self.test_file are
        turned into a schema bank by self.factory.from_motifs().

        This also caches a schema bank, to prevent having to do this
        time consuming operation multiple times.
        """
        # if we already have a cached repository, return it
        if self.schema_bank is not None:
            return self.schema_bank

        # otherwise, we'll read in a new schema bank

        # read in the all of the motif records
        seq_records = []
        motif_handle = open(self.test_file, 'r')
        # try/finally so a parser error cannot leak the open handle
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(motif_handle, seq_parser)

            # next() returns None once the file is exhausted
            seq_record = iterator.next()
            while seq_record is not None:
                seq_records.append(seq_record)
                seq_record = iterator.next()
        finally:
            motif_handle.close()

        # find motifs from the file
        motif_finder = Motif.MotifFinder()
        motif_size = 9

        motif_bank = motif_finder.find(seq_records, motif_size)

        # NOTE(review): the meaning of the .1 and 2 arguments is defined by
        # the factory's from_motifs(), which is not shown here
        schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

        # cache the repository
        self.schema_bank = schema_bank

        return schema_bank
Exemple #15
0
    def setUp(self):
        """Load the FASTA fixture and create the signature finder under test.

        Every record of ``NeuralNetwork/enolase.fasta`` is appended to
        ``self.test_records``, parsed with the unambiguous DNA alphabet.  A
        ``Signature.SignatureFinder`` is stored in ``self.sig_finder``.
        """
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

        self.test_records = []

        # load the records
        handle = open(test_file, 'r')
        # try/finally so a parser error cannot leak the open handle
        try:
            seq_parser = Fasta.SequenceParser(alphabet=IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)

            # next() returns None once the file is exhausted
            seq_record = iterator.next()
            while seq_record is not None:
                self.test_records.append(seq_record)
                seq_record = iterator.next()
        finally:
            handle.close()

        self.sig_finder = Signature.SignatureFinder()
def extract_organisms(file_to_parse):
    """Return the distinct species names found in a FASTA file.

    The species of a record is taken to be the second whitespace-separated
    word of its title.  Names are returned in order of first appearance.

    Arguments:
    o file_to_parse - path of the FASTA file to read.

    Raises IndexError if a record title has fewer than two words.
    """
    all_species = []
    # set mirror of all_species: constant-time duplicate checks while the
    # list keeps the order of first appearance
    seen_species = set()

    fasta_handle = open(file_to_parse, 'r')
    # the handle used to be left open; close it even if parsing fails
    try:
        # set up the parser and iterator
        iterator = Fasta.Iterator(fasta_handle, Fasta.RecordParser())

        cur_record = iterator.next()
        while cur_record is not None:
            # extract the info from the title
            new_species = cur_record.title.split()[1]

            # append the new species to the list if it isn't there
            if new_species not in seen_species:
                seen_species.add(new_species)
                all_species.append(new_species)

            cur_record = iterator.next()
    finally:
        fasta_handle.close()

    return all_species
def get_seqs(blastRootDirectory):

    if len(sys.argv) >= 2:
        numSeqs = int(sys.argv[1])
        if numSeqs < 0 or numSeqs > 100000:
            print 'requested number of sequences is outside allowable range (1-100000). Using default (1000)'
            numSeqs = 10
    else:
        numSeqs = 10
    print 'requesting', numSeqs, 'query sequences from the server'
    seqs = phamServer.request_seqs(server, numSeqs, client)
    '''Builds the file to be blasted from the sequences given'''

    f = open(os.path.join(blastRootDirectory, 'filetoblast.txt'), 'w')

    print seqs
    '''takes the new set of sequences and checks if they exist in the local database
  and, if so, writes the sequence id and translation to a separate FASTA formated input
  file to be passed to the BLASTALL executable'''

    for GeneID in seqs:
        parser = Fasta.RecordParser()

        infile = open(os.path.join(blastRootDirectory, 'blastDB.fasta'))

        iterator = Fasta.Iterator(infile, parser)
        while 1:
            record = iterator.next()
            if not record:
                break
            record_id = record.title

            if GeneID == record_id:
                f.write('>' + record.title + '\n' + record.sequence + '\n')

    f.close()
    return (len(seqs))
Exemple #18
0
def runGlobPlot():
    """Run the GlobPlot analysis for every record of a FASTA file.

    The path of the FASTA file is read from sys.argv[1].  For each record
    the running sum from Sum() is smoothed and differentiated with
    SavitzkyGolay(); the first and last smoothFrame derivative values are
    replaced by simple finite differences of the raw sum.  The slices from
    getSlices() are reported through reportSlicesTXT() and written to
    stdout together with a tab-separated per-residue table.

    If the argument is missing or the file cannot be opened, a usage
    message is printed and SystemExit is raised.
    """
    smoothFrame = 10
    DOM_joinFrame = 15
    DOM_peakFrame = 74
    DIS_joinFrame = 4
    DIS_peakFrame = 5
    try:
        db = open(sys.argv[1], 'r')
    except (IndexError, IOError):
        print('Usage:')
        print('         ./GlobPipe.py FASTAfile')
        raise SystemExit
    parser = Fasta.RecordParser()
    iterator = Fasta.Iterator(db, parser)
    # the handle used to be left open; close it even if the analysis fails
    try:
        while 1:
            cur_record = iterator.next()
            # next() returns None at end of file; test for it explicitly so
            # genuine AttributeErrors raised further down are not swallowed
            if cur_record is None:
                break
            #uppercase is searchspace
            seq = cur_record.sequence.upper()
            # sum function
            sum_vector = Sum(seq, RL)
            # Run Savitzky-Golay; the frame is passed as its repr() string,
            # exactly as the former backtick expression did
            smooth = SavitzkyGolay(repr(smoothFrame), 0, sum_vector)
            dydx_vector = SavitzkyGolay(repr(smoothFrame), 1, sum_vector)
            #test
            sumHEAD = sum_vector[:smoothFrame]
            sumTAIL = sum_vector[len(sum_vector) - smoothFrame:]
            newHEAD = []
            newTAIL = []
            for i in range(len(sumHEAD)):
                # forward difference; the last element has no successor, so
                # fall back to the backward difference there
                try:
                    dHEAD = (sumHEAD[i + 1] - sumHEAD[i]) / 2
                except IndexError:
                    dHEAD = (sumHEAD[i] - sumHEAD[i - 1]) / 2
                try:
                    dTAIL = (sumTAIL[i + 1] - sumTAIL[i]) / 2
                except IndexError:
                    dTAIL = (sumTAIL[i] - sumTAIL[i - 1]) / 2
                newHEAD.append(dHEAD)
                newTAIL.append(dTAIL)
            dydx_vector[:smoothFrame] = newHEAD
            dydx_vector[len(dydx_vector) - smoothFrame:] = newTAIL
            globdoms, globdis = getSlices(dydx_vector, DOM_joinFrame,
                                          DOM_peakFrame, DIS_joinFrame,
                                          DIS_peakFrame)
            s_domMask, coordstrDOM = reportSlicesTXT(globdoms, seq, 'DOM')
            s_final, coordstrDIS = reportSlicesTXT(globdis, s_domMask, 'DIS')
            sys.stdout.write('>' + cur_record.title + '\n')
            sys.stdout.write('# ' + coordstrDOM + '\n')
            sys.stdout.write('# ' + coordstrDIS + '\n')

            # per-residue values; the header now names the columns in the
            # order they are written (it used to list RAW before SMOOTHED)
            sys.stdout.write('# RESIDUE' + '\t' + 'DYDX' + '\t' + 'SMOOTHED' +
                             '\t' + 'RAW\n')
            for i in range(len(dydx_vector)):
                # dydx (positive values seems to indicate disorder in rows more than ~6 chars)  smoothed    raw
                sys.stdout.write(seq[i] + '\t' +
                                 fpformat.fix(dydx_vector[i], 4) + '\t' +
                                 fpformat.fix(smooth[i], 4) + '\t' +
                                 fpformat.fix(sum_vector[i], 4) + '\n')

            print('\n')
    finally:
        db.close()
    return
                    filename = "interpro_" + title + ".xml"
                    result = server.poll(jobId, resultType.type)
                    parseResult(result, wh_domain, wh_evidence, title)
        else:
            counter += 1

    return jobIds


# Ids of the jobs currently tracked; replaced by the return value of
# checkJobs() inside the loop below.
jobIds = []
# True while the loop below is waiting for the number of tracked jobs to drop.
packageFull = False

# NOTE(review): neither handle is closed in the lines visible here, and WH is
# not used in them -- confirm against the rest of the script.
RH = open(infile, "r")
WH = open(options.outfile, "w")

for prot_record in Fasta.Iterator(RH, Fasta.RecordParser()):
    # FASTA-formatted text of the current protein
    seqData = ">" + prot_record.title + "\n" + prot_record.sequence
    # NOTE(review): content is built but never used in the visible code;
    # presumably it is the payload of a submission call -- confirm.
    content = [{'type': 'sequence', 'content': seqData}]
    packageFull = True

    # Re-check the tracked jobs every 15 seconds until fewer than 25 remain.
    while (packageFull):
        jobIds = checkJobs(jobIds, wh_domain, wh_evidence)

        if (len(jobIds) < 25):
            packageFull = False

        if (packageFull):
            time.sleep(15)

    sys.stderr.write("Submitted protein " + prot_record.title +
                     " to InterProScan...\n")
Exemple #20
0
if __name__ == "__main__":
    # Command line entry point.
    #   -h       print the usage text and exit successfully
    #   -s FILE  print the records selected by the sieve read from FILE
    #   -t FILE  print every record passed through the translator from FILE
    # Exactly one positional argument, the FASTA file, is required.

    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hs:t:')
    except getopt.GetoptError:
        # unknown option or missing option argument
        usage()
        sys.exit('Error usage')

    # honour -h before validating the arguments: "-h" on its own used to be
    # rejected as a usage error because no FASTA file was given
    for o, a in opts:
        if o == '-h':
            usage()
            sys.exit(0)

    if not opts or len(args) != 1:
        usage()
        sys.exit('Error usage')

    fasta_file = open(args[0])
    parser = Fasta.RecordParser()

    # the handle used to be left open; close it however the loop ends
    try:
        # getopt only ever returns the declared options, and -h has already
        # been dealt with, so -s and -t are the only cases left
        for o, a in opts:
            if o == '-s':
                sieve = get_sieve(get_input_handle(a))
                iterator = FastaSelectiveIterator(sieve, fasta_file, parser)
                for record in iterator:
                    print(record)
            elif o == '-t':
                translator = FastaTranslator(get_input_handle(a),
                                             reverse=True)
                iterator = Fasta.Iterator(fasta_file, parser)
                for record in iterator:
                    print(translator(record))
    finally:
        fasta_file.close()
Exemple #21
0
def initialize():

    """
    Parse command line options, and read input Fasta file.

    Construct a dictionary contains the following fields:

        sequences         a list of dictionary objects having 'title',
                          'sequence', and 'motif_position' attributes (see
                          also the docstring of gibbs.Gibbs.__init__)
        width             width of motif to find
        weight            weight to use for pseudocounts
        iterations        number of non-improving iterations before stopping
        shifts            maximum phase shifts to detect
        ps_freq           frequency of detecting phase shifts
        init_occurrences  number of base occurrences to use for initial motif
                          positions heuristic
        init_width        width of patterns to use for initial motif positions
                          heuristic

    Return the constructed dictionary.

    Invalid input (no input file, missing width or a width below 2, an
    unreadable file, fewer than two sequences) is reported through the
    option parser's error() method.  With --cow a cow is printed and the
    program exits with status 0.
    """

    parser = OptionParser(usage = "usage: %prog -i FILE -w WIDTH [-h] "
                          "[options]",
                          version = "PyMotif %s (%s)" % (VERSION, DATE),
                          description = "PyMotif is an implementation of the "
                          "Gibbs sampling algorithm for finding local "
                          "alignments of DNA sequences. "
                          "See the accompanied README file for usage "
                          "instructions and the documentation directory for "
                          "implementation details.")

    parser.add_option("-i", "--input", dest="input", metavar="FILE",
                      help="read FILE in Fasta format")
    parser.add_option("-w", "--width", dest="width", metavar="WIDTH",
                      type="int", help="find motif of width WIDTH")
    parser.add_option("-t", "--iterations", dest="iterations",
                      metavar="ITERATIONS", default=ITERATIONS_DEFAULT,
                      type="int", help="number of non-improving iterations "
                      "(default " + str(ITERATIONS_DEFAULT) + ")")
    parser.add_option("-p", "--pseudo", dest="pseudo", metavar="WEIGHT",
                      default=PSEUDOCOUNTS_WEIGHT_DEFAULT, type="float",
                      help="use WEIGHT for weight of pseudocounts (default " +
                      str(PSEUDOCOUNTS_WEIGHT_DEFAULT) + ")")
    parser.add_option("-s", "--phase-shifts", dest="shifts", metavar="SHIFTS",
                      default=PHASE_SHIFTS_DEFAULT, type="int",
                      help="detect phase shifts of width SHIFTS (default " +
                      str(PHASE_SHIFTS_DEFAULT) + ")")
    parser.add_option("-f", "--ps-frequency", dest="frequency",
                      metavar="FREQ", default=PS_FREQUENCY_DEFAULT,
                      type="int", help="if SHIFTS>0, detect phase shifts "
                      "every FREQ iterations (default " +
                      str(PS_FREQUENCY_DEFAULT) + ")")
    parser.add_option("-n", "--init-num-occurrences", dest="initoccurrences",
                      metavar="OCCURRENCES",
                      default=INIT_NUM_OCCURRENCES_DEFAULT, type="int",
                      help="number of base occurrences to use for initial "
                      "positions heuristic (default " +
                      str(INIT_NUM_OCCURRENCES_DEFAULT) + ")")
    parser.add_option("-v", "--init-pattern-width", dest="initwidth",
                      metavar="WIDTH", default=INIT_PATTERN_WIDTH_DEFAULT,
                      type="int", help="if OCCURRENCES>0, width of pattern "
                      "to use for initial positions heuristic (defaults to "
                      "value of --width)")
    parser.add_option("-c", "--cow", action="store_true", dest="cow",
                      default=False, help="display cow (not recommended)")

    (options, args) = parser.parse_args()

    if options.cow:
        # ten random bases for the speech bubble
        s = "".join([choice("ATCG") for _ in range(10)])
        # Created with the cowsay program
        print(""" ____________
< %s >
 ------------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\\
                ||----w |
                ||     ||""" % s)
        sys.exit(0)

    if not options.input:
        parser.error("input file required")

    if not options.width:
        parser.error("width argument required")

    if options.width < 2:
        parser.error("please use a sane motif width")

    # Read contents of Fasta file
    try:
        fasta_handle = open(options.input)
    except IOError:
        parser.error("could not read file %s" % options.input)

    # the handle used to be left open; close it even if parsing fails
    try:
        fasta_parser = Fasta.RecordParser()

        # Iterator for sample data
        fasta_iterator = Fasta.Iterator(fasta_handle, fasta_parser)

        # A list containing a dictionary object for each sequence
        sequences = [{'title':          record.title,
                      'sequence':       record.sequence,
                      'motif_position': 0}
                     for record in fasta_iterator]
    finally:
        fasta_handle.close()

    # We could do some more error checking on the input file here, like
    # checking there's only ATCG and at least a few of them, but for now
    # this is enough
    if len(sequences) < 2:
        parser.error("found %i sequences in input file %s" % (len(sequences),
                                                              options.input))

    return {'sequences':        sequences,
            'width':            options.width,
            'weight':           options.pseudo,
            'iterations':       options.iterations,
            'shifts':           options.shifts,
            'ps_freq':          options.frequency,
            'init_occurrences': options.initoccurrences,
            'init_width':       options.initwidth}
Exemple #22
0
#!/usr/bin/env python
"""Example showing how to deal with internet BLAST from Biopython.

This code is described in great detail in the BLAST section of the Biopython
documentation.
"""
# standard library
import cStringIO

# biopython
from Bio.Blast import NCBIWWW
from Bio import Fasta

# first get the sequence we want to parse from a FASTA file
# NOTE(review): file_for_blast is not closed in the lines visible here.
file_for_blast = open('m_cold.fasta', 'r')
# no parser is passed, so the iterator is used without one
f_iterator = Fasta.Iterator(file_for_blast)

# only the first record of the file is used as the query
f_record = f_iterator.next()

print 'Doing the BLAST and retrieving the results...'
# qblast is called with program 'blastn', database 'nr' and the record
result_handle = NCBIWWW.qblast('blastn', 'nr', f_record)

# save the results for later, in case we want to look at it
save_file = open('m_cold_blast.out', 'w')
# the whole result is read into memory and written to the file in one go
blast_results = result_handle.read()
save_file.write(blast_results)
save_file.close()

print 'Parsing the results and extracting info...'
b_parser = NCBIWWW.BlastParser()