Esempio n. 1
0
def getDict(fname='data/mitochondrion.txt'):
    """Build one summary dict for each of a fixed list of accession ids.

    Every id is fetched with ``functions.get_sequence`` and the returned
    record is summarised as a dict with these keys:

    * ``'seq'``     -- the record exactly as returned by ``get_sequence``
    * ``'id'``      -- the accession id that was requested
    * ``'latin'``   -- the text of ``annotations['source']`` before the
      first ``'('``, with the word ``mitochondrion`` removed and
      surrounding whitespace stripped
    * ``'english'`` -- the text that follows the first ``'('`` (up to the
      next ``'('``, if any) with ``')'`` removed and whitespace stripped;
      an empty string when the source annotation has no parenthesis
    * ``'COX3'``    -- the first ``translation`` qualifier of the first CDS
      feature whose first ``gene`` qualifier equals ``"COX3"``

    Args:
        fname: Not used by this function; kept so that existing callers
            which pass it keep working.

    Returns:
        A list of dicts, one per id, in the order of the hard-coded ids.

    Raises:
        IndexError: if a record contains no CDS feature for gene COX3.
    """
    ids = ['NC_000845.1',
           'NC_004299.1',
           'AC_000022.2',
           'NC_002083.1',
           'NC_001643.1',
           'NC_011137.1',
           'NC_012920.1',
           'NC_001645.1',
           'NC_002008.4',
           'NC_006580.1',
           'NC_012420.1',
           'NC_011391.1',
           'NC_012061.1',
           'NC_001640.1']
    a = []
    for i in ids:
        # NOTE(review): the record looks like a Biopython SeqRecord
        # (annotations / features / qualifiers) -- confirm in functions.
        r = functions.get_sequence(i)

        # Split the source annotation once and reuse both halves.
        source_parts = r.annotations['source'].split('(')
        latin = source_parts[0].replace('mitochondrion', '').strip()
        # A source without a parenthesised name used to raise IndexError;
        # fall back to an empty string instead.
        if len(source_parts) > 1:
            english = source_parts[1].replace(')', '').strip()
        else:
            english = ''

        # Use .get() so CDS features that carry no 'gene' qualifier are
        # skipped rather than aborting the whole lookup with KeyError.
        cox3 = [f.qualifiers['translation'][0]
                for f in r.features
                if f.type == 'CDS'
                and f.qualifiers.get('gene', [None])[0] == "COX3"]

        a.append({'seq': r,
                  'id': i,
                  'latin': latin,
                  'english': english,
                  # Still raises IndexError when no COX3 CDS was found.
                  'COX3': cox3[0]})
    return a
Esempio n. 2
0
                        #AllChains[file][chain][chain_object_id] =  functions.get_backbone_atoms_nucleicacids(chaincheck) <<< incase i need this (superimposer does it)
                    i += 1
""" In order to avoid calculating the sequences multiple times,
I am just going to place the sequences and their identifiers in a list of tuples to be used
in the alignment."""

# Tell the user what is happening when verbose output was requested.
if options.verbose:
    print("Saving sequences...\n\n")

# Collect one (identifier, sequence) tuple per stored chain object.
# The identifier is "<file>_<chain>"; the sequence comes from
# functions.get_sequence applied to the stored chain object.
forAlignmentlist = []
for file in AllChains:
    chains_in_file = AllChains[file]
    # Per-file counter of collected chain objects; it is not read anywhere
    # within this section.
    i = 0
    for chain in chains_in_file:
        objects_in_chain = chains_in_file[chain]
        for new_id in objects_in_chain:
            forAlignmentlist.append(
                (file + "_" + chain,
                 functions.get_sequence(objects_in_chain[new_id])))
            i += 1

#############                            #############
#############   Dealing with Homo-mers   #############
#############                            #############
""" If the protein is a homo-mer, we will only have a single file to deal with.
In that case the user has to provide an integer giving the number of repeating chains
of that homomer in the final structure."""

###
### The method of growing the model here is the same as the original one, which occurs
### further on (if no stoichiometry was provided). The process is explained there.
###
Esempio n. 3
0
    l = sorted([(f.location.end.position - f.location.start.position) / 3 for f in cds])
    print "number of orfs: %5d" % (len(cds))
    print "average length: %7.2f" % (float(sum(l)) / len(l))
    print "mean length:    %5d" % (l[len(l) / 2])

if __name__ == "__main__":
    # Script entry point: fetch one record, predict its reading frames and
    # print length statistics for the CDS features annotated on the record.

    seq_id = 'NC_006058'

    # Start and stop codon sets, keyed by the accession ids this script
    # knows about.
    start_c = {'NC_006058' : set(["ATG"]),
               'NC_007346' : set(["ATG", "TTG", "CTG"]) }
    stop_c = {'NC_006058' : set(["TGA"]),
              'NC_007346' : set(["TAA", "TAG", "TGA"]) }

    rec = functions.get_sequence(seq_id)

    # Predicted reading frames; not read again within this section.
    rf_pred = get_reading_frames(rec, start_c[seq_id], stop_c[seq_id])

    # Annotated CDS features of the record, filtered once and reused below.
    cds = [f for f in rec.features if f.type == "CDS"]

    rf_orig = [{'start': f.location.start.position,
                'end': f.location.end.position,
                # Floor division keeps the length an integer on Python 3
                # as well (plain "/" would produce a float there).
                'length': (f.location.end.position - f.location.start.position) // 3,
                # NOTE(review): the strand sign is flipped here, presumably
                # to match the convention of get_reading_frames -- confirm.
                'strand': -f.strand}
               for f in cds]

    l = sorted([i['length'] for i in rf_orig])
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("number of orfs: %5d" % (len(cds)))
    # Without any CDS feature the statistics are undefined; skip them
    # instead of failing with ZeroDivisionError.
    if l:
        print("average length: %7.2f" % (float(sum(l)) / len(l)))
        # NOTE: l[len(l) // 2] is the middle element of the sorted lengths,
        # i.e. the (upper) median, although the label says "mean".
        # Integer index via "//" so the lookup also works on Python 3.
        print("mean length:    %5d" % (l[len(l) // 2]))