def persist_part_overhang_annotations(repo, vectorname, part, authorid, date):
    '''
    Attach the 5' and 3' overhang features of a vector to a part's sequence.

    The overhang annotations are taken from the second and third values
    returned by get_nucseq_annotations(repo, vectorname). The 5' overhang is
    recorded at position 0 of part['nucseq']; the 3' overhang is recorded at
    the position where it would end exactly at the end of the part's sequence.
    Each annotation is named "<overhang feature name> in <part name>".
    '''
    _, five_prime, three_prime = get_nucseq_annotations(repo, vectorname)
    part_nucseq = part['nucseq']

    five_feature = five_prime['feature']
    repository.add_feature_to_nucseq(
        repo,
        five_feature['name'] + " in " + part['name'],
        part_nucseq,
        five_feature,
        0,
        authorid,
        date)

    three_feature = three_prime['feature']
    # Lengths are taken from the raw (unstripped) sequences.
    three_start = (len(part_nucseq['sequence'])
                   - len(three_feature['nucseq']['sequence']))
    repository.add_feature_to_nucseq(
        repo,
        three_feature['name'] + " in " + part['name'],
        part_nucseq,
        three_feature,
        three_start,
        authorid,
        date)
def persist_part_feature(repo, vectorname, part, familyname, authorid, date):
    '''
    Create the feature lying between a part's overhangs and annotate the part.

    The part's sequence is stripped and lower-cased, then the stretch after
    the 5' overhang and before the 3' overhang (overhangs come from
    get_nucseq_annotations(repo, vectorname)) is stored as a new feature named
    'Feature-<part name>' in family `familyname`. That feature is then added
    to part['nucseq'].
    '''
    _, five_prime, three_prime = get_nucseq_annotations(repo, vectorname)
    five_seq = five_prime['feature']['nucseq']['sequence']
    three_seq = three_prime['feature']['nucseq']['sequence']

    normalized = part['nucseq']['sequence'].strip().lower()
    inner_start = len(five_seq.strip())
    inner_end = len(normalized) - len(three_seq.strip())

    feature_name = 'Feature-' + part['name']
    inner_feature = repository.create_feature(
        repo, feature_name, normalized[inner_start:inner_end], familyname, date)

    # NOTE(review): the annotation position uses the raw 5' overhang length,
    # while the slice above uses the stripped length. They differ only if the
    # stored overhang sequence carries surrounding whitespace -- confirm which
    # one is intended.
    repository.add_feature_to_nucseq(
        repo,
        feature_name,
        part['nucseq'],
        inner_feature,
        len(five_seq),
        authorid,
        date)
def create_moclo_constituent_part_features(repo, pt, constituentparts, authorid,
                                           datecreated):
    '''
    Adds overhangs for constituent parts to repo.

    Walks the composite part's sequence left to right. For the first
    constituent part its 5' overhang is located and annotated; for every
    constituent part each non-overhang annotation and then the 3' overhang
    are located and annotated on pt['nucseq'].

    repo             -- repository object passed through to the repository helpers
    pt               -- composite part; reads pt['name'] and pt['nucseq']
    constituentparts -- ordered sequence of parts; reads each part's 'nucseq'
    authorid         -- author recorded on every annotation
    datecreated      -- creation date recorded on every annotation

    Raises ValueError when a feature cannot be found at or after the current
    position, or when an overhang is found but does not start exactly at the
    current position.
    '''
    compositeseq = pt['nucseq']['sequence'].strip()
    currfeaturestart = 0

    for index, cp in enumerate(constituentparts):
        # Only the first part contributes a 5' overhang annotation. Use the
        # position in the list rather than dict equality so that a repeated
        # part is not mistaken for the first one.
        if index == 0:
            fiveprimeoverhang = repository.get_moclo_overhang_annotation(
                repo, cp['nucseq'], 'FIVE_PRIME')
            overhangseq = fiveprimeoverhang['feature']['nucseq']['sequence'].strip()
            # str.find returns -1 on a miss (str.index would raise its own
            # ValueError and bypass the message below).
            featurestart = compositeseq.find(overhangseq, currfeaturestart)
            if featurestart < 0:
                raise ValueError(
                    'Unable to find overhang annotation in the composite part.')
            if featurestart != currfeaturestart:
                raise ValueError(
                    "5' overhang found in composite part does not follow "
                    "Moclo overhang rule: " + str(featurestart) + ' '
                    + str(currfeaturestart))
            repository.add_feature_to_nucseq(
                repo, pt['name'], pt['nucseq'], fiveprimeoverhang['feature'],
                featurestart, authorid, datecreated)
            currfeaturestart = featurestart + len(overhangseq)

        # For each non-overhang annotation, find the start position in the
        # composite sequence, and add it to the repo.
        for n in get_nonoverhang_annotations(repo, cp['nucseq']):
            featureseq = n['feature']['nucseq']['sequence'].strip()
            featurestart = compositeseq.find(featureseq, currfeaturestart)
            if featurestart < 0:
                raise ValueError('Could not find ' + n['feature']['name']
                                 + ' in ' + pt['name'] + '.')
            currfeaturestart = featurestart + len(featureseq)
            repository.add_feature_to_nucseq(
                repo, pt['name'], pt['nucseq'], n['feature'], featurestart,
                authorid, datecreated)

        # Get the 3' overhang annotation, find where it starts, add it to the repo.
        threeprimeoverhang = repository.get_moclo_overhang_annotation(
            repo, cp['nucseq'], 'THREE_PRIME')
        overhangseq = threeprimeoverhang['feature']['nucseq']['sequence'].strip()
        featurestart = compositeseq.find(overhangseq, currfeaturestart)
        if featurestart < 0:
            raise ValueError(
                'Unable to find overhang annotation in the composite part.')
        if featurestart != currfeaturestart:
            raise ValueError(
                "3' overhang found in composite part that does not follow "
                "Moclo rules: " + str(featurestart) + " "
                + str(currfeaturestart))
        repository.add_feature_to_nucseq(
            repo, pt['name'], pt['nucseq'], threeprimeoverhang['feature'],
            featurestart, authorid, datecreated)
        # Advance past the 3' overhang that was just annotated.
        currfeaturestart = featurestart + len(overhangseq)
def process_vectors(repo, project, vectorsfiles, directories, instanceid, authorid,
                    date):
    '''
    Load vectors described by CSV manifest files into repo.

    For every manifest that has at least one data row, a collection is
    appended to repo['collections'] and added to the project's collection.
    Each data row is comma separated (empty tokens are dropped) and must give
    at least: vector file name, vector name, resistance name, 5' overhang
    name, 3' overhang name; an optional sixth token is the description. The
    vector file is looked up in `directories` and read with
    read_genbank_file. The vector and its nucseq are appended to repo, its
    two overhangs and its resistance are annotated, and the vector is added
    to the manifest's collection. When no 'resistance' feature with the given
    name exists, one is created from the sequence that follows the 3'
    overhang.

    repo         -- dict-like repository; 'collections', 'nucseq' and
                    'vectors' lists are appended to
    project      -- reads project['idcollection']
    vectorsfiles -- iterable of manifest file paths
    directories  -- candidate directories holding the vector sequence files;
                    entries that are not directories are ignored
    instanceid   -- used in the collection description
    authorid     -- author recorded on created objects
    date         -- creation date recorded on created objects

    Raises ValueError when a row has fewer than five tokens, when the vector
    file is not present in any directory, when either overhang is not a known
    'overhang' feature, or when the 3' overhang cannot be located in the
    vector while deriving a resistance feature.
    '''
    # The filter does not depend on the row being processed; do it once.
    directories = [d for d in directories if os.path.isdir(d)]

    for vectorsfile in vectorsfiles:
        with open(vectorsfile) as manifest:
            lines = manifest.readlines()

        # An empty file has no header to validate; skip it like a
        # header-only file instead of failing on lines[0].
        if not lines:
            continue
        validate_input_file(lines[0], vectorsfile)
        if len(lines) <= 1:
            continue

        collectionid = uuid.uuid4()
        vectors = {
            'authorid': authorid,
            'datecreated': date,
            'description': 'Vectors described in ' + instanceid,
            'idcollection': collectionid,
        }
        repo['collections'].append(vectors)
        repository.add_object_to_collection(repo, project['idcollection'],
                                            vectors['idcollection'], 'COLLECTION',
                                            authorid, date)

        # lineno is the line number within the file (the header is line 1).
        for lineno, line in enumerate(lines[1:], 2):
            tokens = [t.strip() for t in line.split(',') if len(t.strip()) > 0]
            if len(tokens) < 5:
                raise ValueError(
                    'The Values.csv file does not have the required number of tokens on line '
                    + str(lineno))

            vectorfilename = tokens[0]
            vectorname = 'Vector-' + tokens[1]
            resistancename = 'Resistance-' + tokens[2]
            fiveprimeoverhangname = 'Overhang-' + tokens[3]
            threeprimeoverhangname = 'Overhang-' + tokens[4]
            if len(tokens) > 5:
                description = tokens[5]
            else:
                description = 'From ' + vectorfilename + ': ' + \
                    fiveprimeoverhangname + ', ' + \
                    threeprimeoverhangname + ', ' + resistancename

            # Locate the sequence file for this row. Failing here prevents a
            # missing file from silently reusing the previous row's sequence.
            for directory in directories:
                if vectorfilename in os.listdir(directory):
                    vectorsequence = read_genbank_file(
                        os.path.join(directory, vectorfilename))
                    break
            else:
                raise ValueError('The vector file ' + vectorfilename
                                 + ' could not be found in the given directories.')

            vectorid = uuid.uuid4()
            # The nucseq shares its id with the vector it belongs to.
            nucseq = {
                'datecreated': date,
                'idnucseq': vectorid,
                'sequence': vectorsequence,
            }
            vector = {
                'authorid': authorid,
                'datecreated': date,
                'description': description,
                'name': vectorname,
                'idvector': vectorid,
                'nucseq': nucseq,
                'iscircular': True,
            }
            repo['nucseq'].append(nucseq)
            repo['vectors'].append(vector)

            # Annotate the first matching 5' and 3' overhang features
            # (names compared case-insensitively).
            ft2 = {}
            foundfeature1 = False
            foundfeature2 = False
            overhangfeatures = repository.get_features_by_family_name(
                repo, 'overhang')
            for feature in overhangfeatures:
                if not foundfeature1 and \
                        feature['name'].upper() == fiveprimeoverhangname.upper():
                    position = repository.get_overhang_position_in_vector(
                        vectorsequence, feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position, authorid,
                                                     date)
                    foundfeature1 = True
                if not foundfeature2 and \
                        feature['name'].upper() == threeprimeoverhangname.upper():
                    position = repository.get_overhang_position_in_vector(
                        vectorsequence, feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position, authorid,
                                                     date)
                    # Remembered so a resistance feature can be derived below.
                    ft2 = feature
                    foundfeature2 = True
            if not foundfeature1 or not foundfeature2:
                raise ValueError(
                    'The overhangs caused by vector ' + vectorname +
                    ' were not defined in the overhangs manifest.')

            # Annotate every existing resistance feature with a matching name.
            foundfeature = False
            for feature in repository.get_features_by_family_name(
                    repo, 'resistance'):
                if feature['name'].upper() == resistancename.upper():
                    position = nucseq['sequence'].find(
                        feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position, authorid,
                                                     date)
                    foundfeature = True

            if not foundfeature:
                # No such resistance feature yet: create one from the
                # sequence that follows the 3' overhang.
                overhangpos = repository.get_overhang_position_in_vector(
                    vectorsequence, ft2['nucseq']['sequence'])
                if overhangpos < 0:
                    raise ValueError('The overhang ' + ft2['name'] +
                                     ' could not be found in the vector ' +
                                     vectorname)
                startpos = overhangpos + len(ft2['nucseq']['sequence']) + 1
                resistancesequence = nucseq['sequence'][startpos:]
                f = repository.create_feature(repo, resistancename,
                                              resistancesequence, 'resistance',
                                              date)
                position = nucseq['sequence'].find(f['nucseq']['sequence'])
                repository.add_feature_to_nucseq(repo, vectorname, nucseq, f,
                                                 position, authorid, date)
                repository.add_object_to_collection(repo, collectionid,
                                                    f['idfeature'], 'FEATURE',
                                                    authorid, date)

            repository.add_object_to_collection(repo, collectionid, vectorid,
                                                'VECTOR', authorid, date)