def getDict(fname='data/mitochondrion.txt'):
    """Fetch a fixed set of mitochondrial records and summarise each one.

    Every accession in the hard-coded list is loaded through
    ``functions.get_sequence`` and reduced to a dictionary with the keys
    ``'seq'``, ``'id'``, ``'latin'``, ``'english'`` and ``'COX3'``.

    Args:
        fname: Not used by this implementation; kept so that existing
            callers passing a path keep working.

    Returns:
        A list with one dictionary per accession, in the order of the
        accession list:

        * ``'seq'``     - the object returned by ``functions.get_sequence``
          (presumably a Biopython ``SeqRecord``; confirm against that helper).
        * ``'id'``      - the accession string.
        * ``'latin'``   - the text of the ``source`` annotation before the
          first ``'('``, with the word ``mitochondrion`` removed.
        * ``'english'`` - the text between the first and second ``'('`` of the
          ``source`` annotation with ``')'`` removed, or ``''`` when the
          annotation carries no parenthesised name.
        * ``'COX3'``    - the first ``translation`` qualifier of the first CDS
          feature whose ``gene`` qualifier is ``COX3``.

    Raises:
        IndexError: if a record has no CDS feature annotated as gene COX3.
    """
    ids = [
        'NC_000845.1', 'NC_004299.1', 'AC_000022.2', 'NC_002083.1',
        'NC_001643.1', 'NC_011137.1', 'NC_012920.1', 'NC_001645.1',
        'NC_002008.4', 'NC_006580.1', 'NC_012420.1', 'NC_011391.1',
        'NC_012061.1', 'NC_001640.1',
    ]

    a = []
    for i in ids:
        r = functions.get_sequence(i)

        # The source annotation is split once and both names are derived
        # from the same pieces: "<latin> (<english>)".
        name_parts = r.annotations['source'].split('(')
        latin = name_parts[0].replace('mitochondrion', '').strip()
        if len(name_parts) > 1:
            english = name_parts[1].replace(')', '').strip()
        else:
            # No parenthesised common name in the annotation.
            english = ''

        # Some CDS features carry no 'gene' qualifier; skip those instead
        # of failing with KeyError while searching for COX3.
        cox3 = [
            f.qualifiers['translation'][0]
            for f in r.features
            if f.type == 'CDS'
            and 'gene' in f.qualifiers
            and f.qualifiers['gene'][0] == "COX3"
        ]
        if not cox3:
            # Same exception type as the original bare `[...][0]`, but with
            # enough context to identify the offending record.
            raise IndexError(
                "record %s has no CDS feature annotated as gene COX3" % i)

        a.append({
            'seq': r,
            'id': i,
            'latin': latin,
            'english': english,
            'COX3': cox3[0],
        })
    return a
# Kept for reference in case it is needed again (the superimposer already does it):
# AllChains[file][chain][chain_object_id] = functions.get_backbone_atoms_nucleicacids(chaincheck)
# NOTE(review): this increment presumably closes a loop that begins above.
i += 1

# To avoid computing the sequences more than once, every sequence is stored
# together with its identifier as a tuple in a list that the alignment reuses.
if options.verbose:
    print("Saving sequences...\n\n")

forAlignmentlist = []
for file in AllChains:
    i = 0
    chains_in_file = AllChains[file]
    for chain in chains_in_file:
        chain_members = chains_in_file[chain]
        for new_id in chain_members:
            # Identifier is "<file>_<chain>"; the sequence comes from the stored chain object.
            addtuple = (
                file + "_" + chain,
                functions.get_sequence(chain_members[new_id]),
            )
            forAlignmentlist.append(addtuple)
            i += 1

#############
# Dealing with homo-mers
#############

# If the protein is a homo-mer we will only have a single file to deal with.
# In that case the user has to provide an integer giving the number of
# repeating chains of that homomer in the end structure.

### The method of growing the model here is the same as the original one, which
### occurs further on (if no stoichiometry was provided). The process is
### explained there.
# NOTE(review): the next four statements are the tail of a definition whose
# header lies above this point; `cds` is expected to be bound there.
# Length is expressed in codons, hence the floor division by 3.
l = sorted([(f.location.end.position - f.location.start.position) // 3 for f in cds])
print("number of orfs: %5d" % (len(cds)))
print("average length: %7.2f" % (float(sum(l)) / len(l)))
# The middle element of the sorted lengths is the median, not the mean.
print("median length: %5d" % (l[len(l) // 2]))


if __name__ == "__main__":
    # Script entry point: load one record, predict its reading frames and
    # print length statistics of the CDS features annotated on the record.
    seq_id = 'NC_006058'

    # Start / stop codon sets to use for each supported accession.
    start_c = {
        'NC_006058': set(["ATG"]),
        'NC_007346': set(["ATG", "TTG", "CTG"]),
    }
    stop_c = {
        'NC_006058': set(["TGA"]),
        'NC_007346': set(["TAA", "TAG", "TGA"]),
    }

    rec = functions.get_sequence(seq_id)
    rf_pred = get_reading_frames(rec, start_c[seq_id], stop_c[seq_id])

    # Filter the annotated CDS features once and derive everything from them.
    cds = [f for f in rec.features if f.type == "CDS"]
    rf_orig = [
        {
            'start': f.location.start.position,
            'end': f.location.end.position,
            # Length in codons; `//` keeps it an integer on Python 2 and 3.
            'length': (f.location.end.position - f.location.start.position) // 3,
            # NOTE(review): the strand sign is inverted relative to the
            # annotation - confirm this matches get_reading_frames' output.
            'strand': -f.strand,
        }
        for f in cds
    ]

    l = sorted(i['length'] for i in rf_orig)
    print("number of orfs: %5d" % (len(cds)))
    if l:
        print("average length: %7.2f" % (float(sum(l)) / len(l)))
        # Upper median of the sorted lengths (integer index on Python 3 too).
        print("median length: %5d" % (l[len(l) // 2]))
    # With no CDS features there is nothing to average, so only the count
    # is reported instead of failing with ZeroDivisionError / IndexError.