def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to one of three csv files derived from
    `data_file_name`, depending on the outcome:
    `_invalid_records.csv`, `_submitted.csv` or `_unsaved_records.csv`.

    Args:
        parent_id: id stored under the node's 'prepared_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    file_name = str(record['sample_name_id']) + '.hostseqprep'

    node.study = 'prediabetes'
    node.comment = file_name
    node.prepared_by = 'Varsha Rao and Reza Sailani'
    node.sequencing_contact = 'Varsha Rao and Reza Sailani'
    node.sequencing_center = 'Stanford University'
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/' + str(node.format)
    node.exp_length = 0  # record['exp_length']
    node.local_file = file_name
    node.storage_duration = 1
    # checksums and size are not set here: the md5 computation that fed
    # them is disabled.
    node.tags = list_tags(
        node.tags,
        # NOTE(review): 'sample name' is filled from visit_id -- confirm
        # this is intended and not a copy/paste slip.
        'sample name: '+record['visit_id'],
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'file prefix: '+ record['prep_id'],
        'file name: '+ file_name,
    )
    node.lib_layout = record['lib_layout']
    node.lib_selection = record['lib_selection']
    node.ncbi_taxon_id = '9606'
    node.prep_id = record['prep_id']

    parent_link = {'prepared_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    File details (path, checksums, size) are only copied onto the node
    when record['consented'] == 'YES'; otherwise the node is flagged with
    private_files = True and given a placeholder md5 and size 0.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'sequenced_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    local_file_name = os.path.basename(record['local_file'])
    node.comment = local_file_name
    node.study = 'prediabetes'
    node.sequence_type = 'nucleotide'
    node.seq_model = record['seq_model']
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length']

    if record['consented'] == 'YES':
        node.local_file = record['local_file']
        node.checksums = {'md5': record['md5'], 'sha256': record['sha256']}
        node.size = int(record['size'])
    else:
        # Non-consented data: no file path is attached, only placeholders.
        node.private_files = True
        node.checksums = {'md5': '00000000000000000000000000000000'}
        node.size = 0

    node.tags = list_tags(
        'sequence type: ' + 'RNAseq',
        'jaxid (sample): ' + record['jaxid_sample'],
        'sample name: ' + record['sample_name_id'],
        'body site: ' + record['body_site'],
        'subject id: ' + record['rand_subject_id'],
        'study: ' + 'prediabetes',
        'prep_id:' + record['prep_id'],
        'file name: ' + local_file_name,
    )

    parent_link = {'sequenced_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'prepared_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.comment = record['prep_id']
    node.frag_size = 301  # goal size
    node.lib_layout = 'paired 301bp'
    node.lib_selection = ''
    node.mimarks = generate_mimarks(record)
    # Taxon id depends on body site; anything that is not 'stool' gets the
    # nasal id.
    # ST: http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=408170
    # NS: http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=1131769
    if record['body_site'] == 'stool':
        node.ncbi_taxon_id = '408170'
    else:
        node.ncbi_taxon_id = '1131769'
    node.prep_id = record['prep_id']
    node.sequencing_center = 'Jackson Laboratory for Genomic Medicine'
    node.sequencing_contact = 'George Weinstock'
    node.storage_duration = 2112

    if record['jaxid_library']:
        library_tag = 'jaxid (library): '+record['jaxid_library']
    else:
        library_tag = 'jaxid (library): unknown'
    node.tags = list_tags(
        node.tags,
        'jaxid (sample): '+record['jaxid_sample'],
        library_tag,
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'study: prediabetes',
        'file prefix: '+ record['prep_id'],
    )

    parent_link = {'prepared_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Fill in `node` from `record`, then validate and save it.

    Cleaned-file details (path, checksums, size) are only attached when
    record['consented'] == 'YES'; otherwise the node is marked with
    private_files = True and receives a placeholder md5 and size 1.

    The record is written to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` according to the outcome.

    Returns the node after a successful save, False in every other case.
    """
    header = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=header)

    clean_path = record['local_file_clean']
    base_name = os.path.basename(clean_path)

    node.study = 'prediabetes'
    node.comment = base_name
    file_format = record['format']  # only 'fasta', 'fastq' allowed!
    node.format = file_format
    node.format_doc = ('https://en.wikipedia.org/wiki/'
                       + file_format.upper() + '_format')

    consented = record['consented'] == 'YES'
    if not consented:
        node.private_files = True
        node.checksums = {'md5': '00000000000000000000000000000000'}
        node.size = 1
    else:
        node.local_file = clean_path
        node.checksums = {
            'md5': record['clean_md5'],
            'sha256': record['clean_sha256'],
        }
        node.size = int(record['clean_size'])

    tag_values = [
        'study: prediabetes',
        'subject id: '+record['rand_subject_id'],
        'sample name: '+record['sample_name_id'],
        'body site: '+record['body_site'],
        'prep_id:' + record['prep_id'],
        'raw_file_id: '+ record['raw_file_id'],
    ]
    node.tags = list_tags(*tag_values)

    log.debug('parent_id: '+str(parent_id))
    node.links = {'computed_from':[parent_id]}

    # Invalid node: log why, keep the record for later inspection.
    if not node.is_valid():
        problems = str(node.validate())
        message = "Invalid node {}!\t\t{}".format(node_type, problems)
        log.error(message)
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=header, values=[record,])
        return False

    if node.save():
        log.info('node saved: '+str(node.comment))
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=header, values=[record,])
        return node

    log.info('node NOT saved: '+str(node.comment))
    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=header, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'sequenced_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = record['local_uri']
    node.prepared_by = record['sequencing_contact']
    node.sequence_type = 'nucleotide'
    node.format = 'fastq'
    # NOTE(review): bare wiki root, no article name -- confirm whether a
    # FASTQ-specific page was intended.
    node.format_doc = 'https://en.wikipedia.org/wiki/'
    node.exp_length = 0  # record['exp_length']
    # NOTE(review): local_file is assigned a one-element list here --
    # confirm the node accepts a list rather than a plain string.
    node.local_file = [record['local_uri']]
    # checksums and size are not set here (assignments are disabled).
    node.tags = list_tags(
        node.tags,
        # NOTE(review): 'sample name' is filled from visit_id -- confirm
        # this is intended and not a copy/paste slip.
        'sample name: '+record['visit_id'],
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'study: prediabetes',
        'file prefix: '+ record['prep_id'],
        'file name: '+ record['local_uri'],
    )
    node.lib_layout = record['lib_layout']
    node.lib_selection = record['lib_selection']
    node.ncbi_taxon_id = record['ncbi_taxon_id']
    node.prep_id = record['prep_id']

    parent_link = {'sequenced_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'computed_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = (record['prep_id']
                    + ' ... Quality trimmed, cleaned, '
                    + 'dehosted, converted fastq to fasta.')
    node.format = record['format']  # only 'fasta', 'fastq' allowed!
    node.format_doc = ('https://en.wikipedia.org/wiki/'
                       + record['format'].upper() + '_format')
    node.local_file = record['local_file']
    node.size = int(record['size'])
    node.checksums = {'md5':record['md5'], 'sha256':record['sha256']}

    if record['jaxid_library']:
        library_tag = 'jaxid (library): '+record['jaxid_library']
    else:
        library_tag = 'jaxid (library): none'
    node.tags = list_tags(
        node.tags,
        'jaxid (sample): '+record['jaxid_sample'],
        library_tag,
        'sample name: '+record['sample_name_id'],
        'body site: '+record['body_site'],
        'visit id: '+record['visit_id'],
        'subject id: '+record['rand_subject_id'],
        'study: prediabetes',
        'dna_prep_id: '+ record['prep_id'],
        'raw_file_id: '+ record['raw_file_id'],
    )

    log.debug('parent_id: '+str(parent_id))
    node.links = {'computed_from':[parent_id]}

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'sequenced_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller). Note the upper-case 'MD5SUM',
            'SHA256' and 'SIZE' keys read below.
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.study = 'prediabetes'
    node.comment = record['local_file']
    node.sequence_type = 'nucleotide'
    node.seq_model = record['seq_model']
    node.format = 'fastq'
    node.format_doc = 'https://en.wikipedia.org/wiki/FASTQ_format'
    node.exp_length = 0  # record['exp_length']
    node.local_file = record['local_file']
    node.checksums = {'md5':record['MD5SUM'], 'sha256':record['SHA256']}
    node.size = int(record['SIZE'])
    node.tags = list_tags(
        node.tags,
        # NOTE(review): 'sample name' is filled from visit_id -- confirm
        # this is intended and not a copy/paste slip.
        'sample name: ' + record['visit_id'],
        'body site: ' + record['body_site'],
        'visit id: ' + record['visit_id'],
        'subject id: ' + record['rand_subject_id'],
        'file prefix: ' + record['sample_name_id'] + '.hostseqprep',
        'file name: ' + record['local_file'],
        'sub-group: ' + record['subtype'],
    )

    parent_link = {'sequenced_from':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended (after a header write) to one of
    `<base>.invalid_records.csv`, `<base>.submitted.csv` or
    `<base>.unsaved.csv`, where `<base>` is `data_file_name` without its
    extension.

    Args:
        parent_id: id stored under the node's 'collected_during' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: data file whose header supplies the csv field
            names and whose base name prefixes the outcome files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    import os  # local import: the file-level import block is not visible

    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)

    # splitext instead of [:-4]: identical for '.csv'-style names, but does
    # not eat characters from names with no or a longer extension.
    base_name = os.path.splitext(data_file_name)[0]

    def _log_record(suffix):
        """Write the header and this record to `<base_name><suffix>`."""
        out_name = base_name + suffix
        write_csv_headers(out_name, fieldnames=csv_fieldnames)
        write_out_csv(out_name, fieldnames=csv_fieldnames, values=[record,])

    node.name = record['sample_name_id']
    node.body_site = record['body_site'].lower()
    node.fma_body_site = record['fma_body_site']
    node.mixs = generate_mixs(record)
    node.tags = list_tags(
        node.tags,
        'sample id: ' + record['sample_name_id'],
        'visit id: ' + record['visit_id'],
        'subject id: ' + record['rand_subject_id'],
        'study: prediabetes',
        'substudy: ' + record['Group'],
    )

    parent_link = {'collected_during':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        _log_record('.invalid_records.csv')
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        _log_record('.submitted.csv')
        return node

    _log_record('.unsaved.csv')
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'derived_from' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller). Note the mixed-case keys
            ('SAMPLE_FLUID_TYPE', 'Type', 'BATCH', 'MODE') read below.
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    sample_label = record['sample_name_id'] + ".metabolome"

    node.study = 'prediabetes'
    node.comment = sample_label
    node.format = record['format']  ## FIX TO HANDLE mzXML files
    node.format_doc = 'https://en.wikipedia.org/wiki/Mass_spectrometry_data_format'
    node.subtype = 'host'
    node.checksums = {'md5':record['md5'], 'sha256':record['sha256']}
    node.local_file = record['local_file']
    node.tags = list_tags(
        node.tags,
        'sample name: '+ sample_label,
        'visit id: '+ record['visit_id'],
        'subject id: '+ record['rand_subject_id'],
        'sample fluid type: ' + record['SAMPLE_FLUID_TYPE'],
        'type: ' + record['Type'],
        'batch: ' + record['BATCH'],
        'mode: ' + record['MODE'],
    )

    log.debug('parent_id: '+str(parent_id))
    node.links = {'derived_from':[parent_id]}

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended (after a header write) to one of
    `<base>.invalid_records.csv`, `<base>.submitted.csv` or
    `<base>.unsaved.csv`, where `<base>` is `data_file_name` without its
    extension.

    Args:
        parent_id: id stored under the node's 'associated_with' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller).
        data_file_name: data file whose header supplies the csv field
            names and whose base name prefixes the outcome files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    import os  # local import: the file-level import block is not visible

    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)

    # splitext instead of [:-4]: identical for '.csv'-style names, but does
    # not eat characters from names with no or a longer extension.
    base_name = os.path.splitext(data_file_name)[0]

    def _log_record(suffix):
        """Write the header and this record to `<base_name><suffix>`."""
        out_name = base_name + suffix
        write_csv_headers(out_name, fieldnames=csv_fieldnames)
        write_out_csv(out_name, fieldnames=csv_fieldnames, values=[record,])

    node.study = 'prediabetes'
    node.subtype = 'prediabetes'
    node.tags = list_tags(
        'Race: ' + get_race(record['race_code']),
        'age: ' + record['age'],
        'gender: ' + get_gender(record['gender']),
    )

    parent_link = {'associated_with':[parent_id]}
    log.debug('parent_id: '+str(parent_link))
    node.links = parent_link

    if not node.is_valid():
        _log_record('.invalid_records.csv')
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        _log_record('.submitted.csv')
        return node

    _log_record('.unsaved.csv')
    return False
def validate_record(parent_id, node, record, data_file_name=node_type):
    """Populate `node` from `record`, validate it, and save it.

    The record is appended to `<data_file_name>_invalid_records.csv`,
    `_submitted.csv` or `_unsaved_records.csv` depending on the outcome.

    Args:
        parent_id: id stored under the node's 'by' link.
        node: node object to fill in; must offer is_valid(), validate()
            and save().
        record: mapping of field name to value (presumably one csv row --
            confirm against caller). 'visit_number' and 'interval' must
            be convertible with int().
        data_file_name: base name used for the header lookup and for the
            outcome csv files.

    Returns:
        The node when node.save() succeeds, otherwise False (both for an
        invalid node and for a failed save).
    """
    log.info("in validate/save: "+node_type)
    csv_fieldnames = get_field_header(data_file_name)
    write_csv_headers(data_file_name, fieldnames=csv_fieldnames)

    node.visit_id = record['visit_id']
    node.visit_number = int(record['visit_number'])
    node.interval = int(record['interval'])
    node.tags = list_tags(
        node.tags,
        'rand_subject_id: '+record['rand_subject_id'],
        'study: prediabetes',
    )

    log.debug('parent_id: '+str(parent_id))
    node.links = {'by':[parent_id]}

    # NOTE(review): this second header read looks redundant; kept because
    # get_field_header's side effects are not visible from here.
    csv_fieldnames = get_field_header(data_file_name)

    if not node.is_valid():
        write_out_csv(data_file_name+'_invalid_records.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        invalidities = node.validate()
        err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities))
        log.error(err_str)
        # Explicit False (was an implicit None) to honour the documented
        # contract.
        return False

    if node.save():
        write_out_csv(data_file_name+'_submitted.csv',
                      fieldnames=csv_fieldnames, values=[record,])
        return node

    write_out_csv(data_file_name+'_unsaved_records.csv',
                  fieldnames=csv_fieldnames, values=[record,])
    return False
def validate_record(parent_id, node, record):
    """Fill in `node` from `record`, then validate and save it.

    Unlike a soft failure, an invalid node is fatal here: the validation
    messages are logged and re-raised as an Exception.

    Returns the node after a successful save, False when the save fails.
    """
    log.debug("in validate/save: "+node_type)

    node.rand_subject_id = record['rand_subject_id']
    node.gender = get_gender(record['gender'])
    node.race = get_race(record['race_code'])

    # A falsy age yields the bare tag 'unk' (no 'age: ' prefix).
    if record['age']:
        age_tag = 'age: '+record['age']
    else:
        age_tag = 'unk'
    node.tags = list_tags(age_tag, 'study: prediabetes')

    node.links = {'participates_in':[parent_id]}

    if node.is_valid():
        return node if node.save() else False

    problems = node.validate()
    message = "Invalid!\n{}".format("\n".join(problems))
    log.error(message)
    raise Exception(message)
NodeLoadFunc = 'load_visitattribute' return load_node(internal_id, search_field, NodeTypeName, NodeLoadFunc) def validate_record(parent_id, node, record, data_file_name=node_type): """update record fields validate node if valid, save, if not, return false """ log.info("in validate/save: "+node_type) csv_fieldnames = get_field_header(data_file_name) node.study = 'prediabetes' node.subtype = 'prediabetes' node.tags = list_tags('') parent_link = {'associated_with':[parent_id]} log.debug('parent_id: '+str(parent_link)) node.links = parent_link if not node.is_valid(): invalids = data_file_name[:-4]+'.invalid_records.csv' write_csv_headers(invalids, fieldnames=csv_fieldnames) write_out_csv(invalids, fieldnames=csv_fieldnames, values=[record,]) invalidities = node.validate() err_str = "Invalid {}!\n\t{}".format(node_type, str(invalidities)) log.error(err_str) # raise Exception(err_str) elif node.save():