def load_into_db(report):
    """
    If report hasn't been loaded, and fits format, bulk load into db.

    The report file is read line by line. Every line that splits into more
    than one column becomes one unsaved ``Variant``; all of them are written
    with a single ``bulk_create`` call.

    :param report: report whose ``report_file`` is parsed; it is attached to
        every created ``Variant``
    :return boolean: True when the file was parsed and its variants were
        bulk-created; False when the report was already loaded or its file
        name was rejected by ``report_file_formatter``
    """
    # NOTE(review): this file contains a second ``def load_into_db`` further
    # down; if both live at module level the later one replaces this one at
    # import time -- confirm which is meant to be kept.
    if is_loaded(report):
        return False

    media_path = get_media_path()
    # The stored name is validated without its first two characters
    # (presumably a leading "./" -- TODO confirm against the upload code).
    report_filename = report.report_file.name[2:]
    print("{}/{}".format(media_path, report.report_file.name))

    if not report_file_formatter(report_filename):
        return False

    report_path = media_path + report.report_file.name
    print("{} checks out.".format(report_path))

    all_variants = []
    # "with" guarantees the handle is closed, including when a malformed
    # value makes int()/float() raise part-way through the file.
    with open(report_path, 'r') as report_file:
        header_line_dict = get_header_cols_and_delim(report_file)
        headers = header_line_dict['cols']
        splitby = header_line_dict['delim']
        (canonical, extra_headers) = classify_headers(headers)
        print("Canonical: {}\nExtra: {}".format(canonical, extra_headers))

        # Resolve column positions once per file instead of calling
        # headers.index() several times per field on every row.
        canonical_columns = [(h, headers.index(h))
                             for h in canonical if h in headers]
        extra_columns = [(eh, headers.index(eh)) for eh in extra_headers]

        for line in report_file:
            data = line.rstrip('\n').split(splitby)
            if len(data) <= 1:
                # nothing to load from a line that yields at most one column
                continue

            variant = Variant()
            variant.report = report

            # process canonical
            for h, idx in canonical_columns:
                kind = variant_headers[h]

                if kind == 'int':
                    # empty cells leave the attribute at its default
                    if data[idx]:
                        setattr(variant, h, int(data[idx]))

                elif kind == 'float':
                    field = h
                    if '%' in h:
                        # model attribute names spell the sign out as "pct"
                        field = h.replace('%', 'pct')
                    # Percent signs are dropped from the raw value so it
                    # parses as a float; a no-op for plain numbers.
                    data[idx] = data[idx].replace('%', '')
                    if data[idx]:
                        # set values that are divided by 0 to tumor alt count
                        if (h == 'tn_pct_alt_ratio'
                                and data[idx] in ('NA', '10000.00')):
                            data[idx] = data[headers.index('tumor_alt_count')]
                        setattr(variant, field, float(data[idx]))

                elif kind == 'str':
                    # two headers are stored under different attribute names
                    field = h
                    if h == 'chr':
                        field = 'chrom'
                    if h == 'gene':
                        field = 'gene_name'
                    setattr(variant, field, data[idx])

            # process extra fields: non-empty cells become "header=value"
            # pairs joined with ';'
            extra_headers_list = ['{}={}'.format(eh, data[idx])
                                  for eh, idx in extra_columns if data[idx]]
            variant.extra_info = ';'.join(extra_headers_list)

            # Saving each variant individually was far too slow, so they
            # are collected here and written in bulk below.
            all_variants.append(variant)

    # process all at once
    Variant.objects.bulk_create(all_variants)
    print("Loaded {} variants from file: {}".format(
        len(all_variants), report_file.name))

    # Remove uploaded file from server
    #os.remove(media_path + report.report_file.name)
    return True
def load_into_db(report):
    """
    If report hasn't been loaded, and fits format, bulk load into db.

    The report file is read line by line. Every line that splits into more
    than one column becomes one unsaved ``Variant``; all of them are written
    with a single ``bulk_create`` call.

    :param report: report whose ``report_file`` is parsed; it is attached to
        every created ``Variant``
    :return boolean: True when the file was parsed and its variants were
        bulk-created; False when the report was already loaded or its file
        name was rejected by ``report_file_formatter``
    """
    if is_loaded(report):
        return False

    # media path doesn't work here using URL when debug is off
    media_path = settings.MEDIA_ROOT
    # only the bare file name (no directories) is checked against the format
    report_filename = os.path.basename(report.report_file.name)
    print("{}/{}".format(media_path, report.report_file.name))

    if not report_file_formatter(report_filename):
        return False

    # NOTE(review): plain concatenation, no separator is inserted here --
    # presumably MEDIA_ROOT ends with one; confirm against settings.
    report_path = media_path + report.report_file.name
    print("{} checks out.".format(report_path))

    all_variants = []
    # "with" guarantees the handle is closed, including when a malformed
    # value makes int()/float() raise part-way through the file.
    with open(report_path, 'r') as report_file:
        header_line_dict = get_header_cols_and_delim(report_file)
        headers = header_line_dict['cols']
        splitby = header_line_dict['delim']
        (canonical, extra_headers) = classify_headers(headers)
        print("Canonical: {}\nExtra: {}".format(canonical, extra_headers))

        # Resolve column positions once per file instead of calling
        # headers.index() several times per field on every row.
        canonical_columns = [(h, headers.index(h))
                             for h in canonical if h in headers]
        extra_columns = [(eh, headers.index(eh)) for eh in extra_headers]

        for line in report_file:
            data = line.rstrip('\n').split(splitby)
            if len(data) <= 1:
                # nothing to load from a line that yields at most one column
                continue

            variant = Variant()
            variant.report = report

            # process canonical
            for h, idx in canonical_columns:
                kind = variant_headers[h]

                if kind == 'int':
                    # empty cells leave the attribute at its default
                    if data[idx]:
                        setattr(variant, h, int(data[idx]))

                elif kind == 'float':
                    field = h
                    if '%' in h:
                        # model attribute names spell the sign out as "pct"
                        field = h.replace('%', 'pct')
                    # Percent signs are dropped from the raw value so it
                    # parses as a float; a no-op for plain numbers.
                    data[idx] = data[idx].replace('%', '')
                    if data[idx]:
                        # set values that are divided by 0 to tumor alt count
                        if (h == 'tn_pct_alt_ratio'
                                and data[idx] in ('NA', '10000.00')):
                            data[idx] = data[headers.index('tumor_alt_count')]
                        setattr(variant, field, float(data[idx]))

                elif kind == 'str':
                    # two headers are stored under different attribute names
                    field = h
                    if h == 'chr':
                        field = 'chrom'
                    if h == 'gene':
                        field = 'gene_name'
                    setattr(variant, field, data[idx])

            # process extra fields: non-empty cells become "header=value"
            # pairs joined with ';'
            extra_headers_list = ['{}={}'.format(eh, data[idx])
                                  for eh, idx in extra_columns if data[idx]]
            variant.extra_info = ';'.join(extra_headers_list)

            # Saving each variant individually was far too slow, so they
            # are collected here and written in bulk below.
            all_variants.append(variant)

    # process all at once
    Variant.objects.bulk_create(all_variants)
    print("Loaded {} variants from file: {}".format(
        len(all_variants), report_file.name))

    # Remove uploaded file from server
    #os.remove(media_path + report.report_file.name)
    return True