def process_overhangs(repo, project, overhangs_file, instanceid, authorid, date):
    """Register the overhangs listed in a CSV manifest as features.

    The first line of ``overhangs_file`` is treated as a header and handed to
    ``validate_input_file``. If the file holds nothing but that header the
    function returns without touching ``repo``. Otherwise a new collection
    named ``'overhang-' + instanceid`` is appended to ``repo['collections']``,
    registered under the project's collection, and one ``'overhang'`` feature
    is created per data row and added to the new collection.

    Args:
        repo: Repository dict; must provide a ``'collections'`` list.
        project: Dict providing the ``'idcollection'`` of the project.
        overhangs_file: Path of the CSV manifest (name, sequence, ...).
        instanceid: Identifier used in the collection name and description.
        authorid: Author recorded on the collection and its memberships.
        date: Creation date recorded on every created object.

    Returns:
        None.
    """
    with open(overhangs_file) as file:
        inputlines = file.readlines()
    validate_input_file(inputlines[0], overhangs_file)
    if len(inputlines) <= 1:
        return

    collectionid = uuid.uuid4()
    collection = {
        'authorid': authorid,
        'datecreated': date,
        'description': 'Overhangs described in ' + instanceid,
        'name': 'overhang-' + instanceid,
        'idcollection': collectionid,
    }
    repo['collections'].append(collection)
    # The project collection is the parent and the new collection the member,
    # matching the argument order used by the other processors in this file.
    repository.add_object_to_collection(repo, project['idcollection'],
                                        collectionid, 'COLLECTION',
                                        authorid, date)

    # Skip the header row: it was validated above and is not an overhang.
    for line in inputlines[1:]:
        if ',' not in line:
            continue
        # Strip each token so padding and the trailing newline do not end up
        # inside the feature name or sequence.
        tokens = [token.strip() for token in line.split(',')]
        featurename = 'overhang-' + tokens[0]
        featuresequence = tokens[1].lower()
        feature = repository.create_feature(repo, featurename,
                                            featuresequence, 'overhang', date)
        # create_feature's result is used as a dict elsewhere in this file
        # (read via ['idfeature']); the collection needs the id, not the dict.
        repository.add_object_to_collection(repo, collectionid,
                                            feature['idfeature'], 'FEATURE',
                                            authorid, date)
def add_new_family_to_all_collections(repo, partfamily, allcollections,
                                      project, instanceid, authorid, date):
    """Create a family plus its part collection and record both.

    A family dict whose name and id are both ``partfamily`` is appended to
    ``repo['families']``. A matching collection is appended to
    ``repo['collections']``, stored in ``allcollections`` under the key
    ``partfamily``, and registered under the project's collection.

    Args:
        repo: Repository dict with ``'families'`` and ``'collections'`` lists.
        partfamily: Name of the family to add.
        allcollections: Dict of family name to collection dict; updated
            in place.
        project: Dict providing the ``'idcollection'`` of the project.
        instanceid: Identifier used in the collection name and description.
        authorid: Author recorded on the collection.
        date: Creation date recorded on the collection.

    Returns:
        None.
    """
    repo['families'].append({'name': partfamily, 'idfamily': partfamily})

    new_collection = {
        'authorid': authorid,
        'datecreated': date,
        'description': 'Parts of ' + partfamily + ' family described in '
                       + instanceid,
        'name': partfamily + '-part-' + instanceid,
        'idcollection': uuid.uuid4(),
    }
    repo['collections'].append(new_collection)
    allcollections[partfamily] = new_collection

    repository.add_object_to_collection(repo, project['idcollection'],
                                        new_collection['idcollection'],
                                        'COLLECTION', authorid, date)
def create_collections_by_family(repo, project, instanceid, authorid, date):
    """Build one part collection for every family already in the repository.

    For each entry of ``repo['families']`` a collection dict is created,
    appended to ``repo['collections']`` and registered under the project's
    collection.

    Args:
        repo: Repository dict with ``'families'`` and ``'collections'`` lists.
        project: Dict providing the ``'idcollection'`` of the project.
        instanceid: Identifier used in each collection name and description.
        authorid: Author recorded on each collection.
        date: Creation date recorded on each collection.

    Returns:
        Dict mapping each family name to the collection dict created for it.
    """
    by_family = {}
    for family in repo['families']:
        family_name = family['name']
        family_collection = {
            'authorid': authorid,
            'datecreated': date,
            'description': 'Parts of ' + family_name
                           + ' family described in ' + instanceid,
            'name': family_name + '-part-' + instanceid,
            'idcollection': uuid.uuid4(),
        }
        repo['collections'].append(family_collection)
        by_family[family_name] = family_collection
        repository.add_object_to_collection(
            repo, project['idcollection'],
            family_collection['idcollection'], 'COLLECTION', authorid, date)
    return by_family
def process_plasmids(repo, project, plasmidsfiles, directories, instanceid,
                     authorid, date):
    """Persist the parts and plasmids described by each plasmids manifest.

    For every manifest with at least one data row, a ``'part-' + instanceid``
    collection is appended to ``repo['collections']`` and per-family
    collections are created. Each data row must hold at least four non-empty
    comma-separated tokens: plasmid file name, part family, part name and
    vector name; an optional fifth token is the description. The plasmid
    file is looked up in ``directories``, the part is extracted and
    persisted, added to the manifest collection and to its family collection
    (a family missing from the repository is created on the fly), and
    finally a plasmid is persisted for the part and its vector.

    Args:
        repo: Repository dict; ``'collections'`` and ``'vectors'`` are used
            directly here.
        project: Dict providing the ``'idcollection'`` of the project.
        plasmidsfiles: Iterable of paths of plasmids manifests.
        directories: Candidate directories holding the plasmid files; entries
            that are not existing directories are ignored.
        instanceid: Identifier used in collection names and descriptions.
        authorid: Author recorded on created objects.
        date: Creation date recorded on created objects.

    Returns:
        None.

    Raises:
        ValueError: If a data row has fewer than four tokens, or the plasmid
            file of a row cannot be found (or yields an empty sequence).
        IndexError: If no vector in ``repo['vectors']`` matches a row's
            vector name.
    """
    # The set of usable directories does not depend on the row; filter once.
    directories = [d for d in directories if os.path.isdir(d)]

    for plasmidsfile in plasmidsfiles:
        with open(plasmidsfile) as file:
            lines = file.readlines()
        validate_input_file(lines[0], plasmidsfile)
        if len(lines) <= 1:
            continue

        collectionid = uuid.uuid4()
        plasmids = {
            'authorid': authorid,
            'datecreated': date,
            'description': 'Parts described in ' + instanceid,
            'name': 'part-' + instanceid,
            'idcollection': collectionid,
        }
        repo['collections'].append(plasmids)

        # Create dict of 'family name : collection dict'
        allcollections = create_collections_by_family(
            repo, project, instanceid, authorid, date)

        # lineno counts data rows (the header is not counted).
        for lineno, line in enumerate(lines[1:], start=1):
            tokens = [t.strip() for t in line.split(',') if t.strip()]
            if len(tokens) < 4:
                # Build a single message; passing the fragments as separate
                # arguments would make the exception text a tuple.
                raise ValueError(
                    'The plasmids.csv file does not comply with the format. '
                    'Line {} has {} tokens, but should have at least 4.'
                    .format(lineno, len(tokens)))

            plasmidfilename = tokens[0]
            partfamily = tokens[1].lower()
            partname = 'Part-' + tokens[2]
            vectorname = 'Vector-' + tokens[3]
            if len(tokens) > 4:
                description = tokens[4]
            else:
                description = ('From ' + plasmidfilename + ' part ' + partname
                               + ' vector ' + vectorname + ' of type '
                               + partfamily + '.')

            # Reset per row so a missing file is reported instead of reusing
            # the previous row's sequence (or hitting an unbound local).
            plasmidsequence = None
            for directory in directories:
                if plasmidfilename in os.listdir(directory):
                    plasmidsequence = read_genbank_file(
                        os.path.join(directory, plasmidfilename))
                    break
            if not plasmidsequence:
                raise ValueError('Could not retrieve plasmid sequence.')

            partsequence = get_part_sequence(repo, plasmidsequence, vectorname)
            part = repository.persist_part(repo, partname, partsequence,
                                           description, True, authorid, date)
            persist_part_overhang_annotations(repo, vectorname, part,
                                              authorid, date)
            persist_part_feature(repo, vectorname, part, partfamily,
                                 authorid, date)
            repository.add_object_to_collection(repo, collectionid,
                                                part['idpart'], 'PART',
                                                authorid, date)

            # TODO - I add new families, instead of raising an error. See Java line 702+
            if partfamily not in allcollections:
                add_new_family_to_all_collections(repo, partfamily,
                                                  allcollections, project,
                                                  instanceid, authorid, date)
            repository.add_object_to_collection(
                repo, allcollections[partfamily]['idcollection'],
                part['idpart'], 'PART', authorid, date)

            # Case-insensitive match; IndexError if the vector is unknown.
            vector = [
                v for v in repo['vectors']
                if v['name'].lower() == vectorname.lower()
            ][0]
            repository.persist_plasmid(repo, 'PLASMID-' + tokens[2], part,
                                       vector, authorid, date)
def process_vectors(repo, project, vectorsfiles, directories, instanceid,
                    authorid, date):
    """Persist the vectors described by each vectors manifest.

    For every manifest with at least one data row, a collection is appended
    to ``repo['collections']`` and registered under the project's collection.
    Each data row must hold at least five non-empty comma-separated tokens:
    vector file name, vector name, resistance name, 5' overhang name and
    3' overhang name; an optional sixth token is the description. The vector
    file is looked up in ``directories``; a vector and its nucleotide
    sequence are appended to ``repo['vectors']`` and ``repo['nucseq']``; both
    overhang features are annotated on the sequence; and the resistance
    feature is annotated, being created from the sequence that follows the
    3' overhang when no resistance feature of that name exists yet.

    Args:
        repo: Repository dict; ``'collections'``, ``'nucseq'`` and
            ``'vectors'`` are appended to directly here.
        project: Dict providing the ``'idcollection'`` of the project.
        vectorsfiles: Iterable of paths of vectors manifests.
        directories: Candidate directories holding the vector files; entries
            that are not existing directories are ignored.
        instanceid: Identifier used in the collection description.
        authorid: Author recorded on created objects.
        date: Creation date recorded on created objects.

    Returns:
        None.

    Raises:
        ValueError: If a data row has fewer than five tokens, the vector file
            cannot be found (or yields an empty sequence), one of the
            overhangs is not a known overhang feature, or the 3' overhang
            cannot be located in the vector when deriving the resistance.
    """
    # The set of usable directories does not depend on the row; filter once.
    directories = [d for d in directories if os.path.isdir(d)]

    for vectorsfile in vectorsfiles:
        with open(vectorsfile) as file:
            lines = file.readlines()
        validate_input_file(lines[0], vectorsfile)
        if len(lines) <= 1:
            continue

        collectionid = uuid.uuid4()
        # NOTE(review): unlike the sibling processors, this collection gets no
        # 'name' key - confirm whether that is intentional.
        vectors = {
            'authorid': authorid,
            'datecreated': date,
            'description': 'Vectors described in ' + instanceid,
            'idcollection': collectionid,
        }
        repo['collections'].append(vectors)
        repository.add_object_to_collection(repo, project['idcollection'],
                                            vectors['idcollection'],
                                            'COLLECTION', authorid, date)

        # lineno counts data rows (the header is not counted), the same
        # numbering process_plasmids uses.
        for lineno, line in enumerate(lines[1:], start=1):
            tokens = [t.strip() for t in line.split(',') if t.strip()]
            if len(tokens) < 5:
                # str() is required: concatenating the int directly raised
                # TypeError and hid the intended ValueError.
                raise ValueError(
                    'The Values.csv file does not have the required number of tokens on line '
                    + str(lineno))

            vectorfilename = tokens[0]
            vectorname = 'Vector-' + tokens[1]
            resistancename = 'Resistance-' + tokens[2]
            fiveprimeoverhangname = 'Overhang-' + tokens[3]
            threeprimeoverhangname = 'Overhang-' + tokens[4]
            if len(tokens) > 5:
                description = tokens[5]
            else:
                description = ('From ' + vectorfilename + ': '
                               + fiveprimeoverhangname + ', '
                               + threeprimeoverhangname + ', '
                               + resistancename)

            # Reset per row so a missing file is reported instead of reusing
            # the previous row's sequence (or hitting an unbound local).
            vectorsequence = None
            for directory in directories:
                if vectorfilename in os.listdir(directory):
                    vectorsequence = read_genbank_file(
                        os.path.join(directory, vectorfilename))
                    break
            if not vectorsequence:
                raise ValueError('Could not retrieve vector sequence.')

            vectorid = uuid.uuid4()
            vector = {
                'authorid': authorid,
                'datecreated': date,
                'description': description,
                'name': vectorname,
                'idvector': vectorid,
            }
            # The nucleotide sequence shares its id with the vector.
            nucseq = {
                'datecreated': date,
                'idnucseq': vectorid,
                'sequence': vectorsequence,
            }
            vector['nucseq'] = nucseq
            vector['iscircular'] = True
            repo['nucseq'].append(nucseq)
            repo['vectors'].append(vector)

            # Annotate the first feature matching each overhang name
            # (case-insensitive); the 3' feature is kept for the resistance
            # fallback below.
            ft2 = {}
            foundfeature1 = False
            foundfeature2 = False
            overhangfeatures = repository.get_features_by_family_name(
                repo, 'overhang')
            for feature in overhangfeatures:
                featurename = feature['name'].upper()
                if (not foundfeature1
                        and featurename == fiveprimeoverhangname.upper()):
                    position = repository.get_overhang_position_in_vector(
                        vectorsequence, feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position,
                                                     authorid, date)
                    foundfeature1 = True
                if (not foundfeature2
                        and featurename == threeprimeoverhangname.upper()):
                    position = repository.get_overhang_position_in_vector(
                        vectorsequence, feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position,
                                                     authorid, date)
                    ft2 = feature
                    foundfeature2 = True
            if not foundfeature1 or not foundfeature2:
                raise ValueError(
                    'The overhangs caused by vector ' + vectorname +
                    ' were not defined in the overhangs manifest.')

            # Annotate every existing resistance feature with a matching name.
            foundfeature = False
            for feature in repository.get_features_by_family_name(
                    repo, 'resistance'):
                if feature['name'].upper() == resistancename.upper():
                    position = nucseq['sequence'].find(
                        feature['nucseq']['sequence'])
                    repository.add_feature_to_nucseq(repo, vectorname, nucseq,
                                                     feature, position,
                                                     authorid, date)
                    foundfeature = True

            if not foundfeature:
                # No such resistance yet: derive it from the vector sequence
                # that follows the 3' overhang.
                overhangpos = repository.get_overhang_position_in_vector(
                    vectorsequence, ft2['nucseq']['sequence'])
                if overhangpos < 0:
                    raise ValueError('The overhang ' + ft2['name'] +
                                     ' could not be found in the vector ' +
                                     vectorname)
                startpos = overhangpos + len(ft2['nucseq']['sequence']) + 1
                resistancesequence = nucseq['sequence'][startpos:]
                f = repository.create_feature(repo, resistancename,
                                              resistancesequence,
                                              'resistance', date)
                position = nucseq['sequence'].find(f['nucseq']['sequence'])
                repository.add_feature_to_nucseq(repo, vectorname, nucseq, f,
                                                 position, authorid, date)
                repository.add_object_to_collection(repo, collectionid,
                                                    f['idfeature'], 'FEATURE',
                                                    authorid, date)

            repository.add_object_to_collection(repo, collectionid, vectorid,
                                                'VECTOR', authorid, date)