Beispiel #1
0
def load_into_db(report):
    """
    Bulk load the variants listed in a report's file into the database.

    Nothing is loaded when ``is_loaded(report)`` is true. Otherwise each
    line of the report file that splits into more than one column becomes
    one ``Variant`` attached to ``report``:

    * canonical columns are converted according to ``variant_headers``
      ('int', 'float' or 'str') and stored as attributes of the variant;
    * non-empty extra columns are stored in ``extra_info`` as
      ``name=value`` pairs joined by ``;``.

    All variants are written with a single ``bulk_create`` call.

    :param report: report whose ``report_file.name`` is appended to the
        media path to locate the file to read
    :return boolean: False if the report was already loaded, True once the
        variants have been created
    """
    if is_loaded(report):
        return False

    media_path = get_media_path()
    # Drops the first two characters of the stored name (presumably a
    # leading "./" -- TODO confirm against how report_file is saved).
    report_filename = report.report_file.name[2:]
    print("{}/{}".format(media_path, report.report_file.name))

    # NOTE(review): a failed format check is only skipped in the log; the
    # load below proceeds either way. Confirm whether it should abort.
    checks_out = report_file_formatter(report_filename)
    report_path = media_path + report.report_file.name
    if checks_out:
        print("{} checks out.".format(report_path))

    all_variants = []
    # The context manager guarantees the file is closed, including when a
    # row fails to convert and an exception propagates.
    with open(report_path, 'r') as report_file:
        header_line_dict = get_header_cols_and_delim(report_file)
        headers = header_line_dict['cols']
        (canonical, extra_headers) = classify_headers(headers)
        print("Canonical: {}\nExtra: {}".format(canonical, extra_headers))
        splitby = header_line_dict['delim']

        column_cache = {}

        def column_of(name):
            """Return the position of ``name`` in ``headers`` (memoised)."""
            try:
                return column_cache[name]
            except KeyError:
                # headers.index keeps the first-occurrence semantics and
                # still raises ValueError for a column that is absent.
                position = column_cache[name] = headers.index(name)
                return position

        # The header row is fixed for the whole file, so decide once which
        # canonical columns are actually present.
        canonical_present = [h for h in canonical if h in headers]

        for line in report_file:
            data = line.rstrip('\n').split(splitby)
            if len(data) <= 1:
                # blank or delimiter-less line: not a data row
                continue

            variant = Variant()
            variant.report = report

            # process canonical
            for h in canonical_present:
                kind = variant_headers[h]
                col = column_of(h)
                if kind == 'int':
                    # empty cells leave the attribute untouched
                    if data[col]:
                        setattr(variant, h, int(data[col]))
                elif kind == 'float':
                    # '%' in the header name is spelled out as 'pct' in
                    # the attribute name
                    field = h.replace('%', 'pct')
                    data[col] = data[col].replace('%', '')
                    if data[col]:
                        # set values that are divided by 0 to tumor alt
                        # count
                        if (h == 'tn_pct_alt_ratio'
                                and data[col] in ('NA', '10000.00')):
                            data[col] = data[column_of('tumor_alt_count')]
                        setattr(variant, field, float(data[col]))
                elif kind == 'str':
                    # two columns are stored under different attribute
                    # names
                    if h == 'chr':
                        field = 'chrom'
                    elif h == 'gene':
                        field = 'gene_name'
                    else:
                        field = h
                    setattr(variant, field, data[col])

            # process extra fields
            extra_headers_list = []
            for eh in extra_headers:
                value = data[column_of(eh)]
                if value:
                    extra_headers_list.append('{}={}'.format(eh, value))
            variant.extra_info = ';'.join(extra_headers_list)

            # Saving each variant individually is far too slow; collect
            # them and insert in bulk below.
            all_variants.append(variant)

    # process all at once
    Variant.objects.bulk_create(all_variants)
    print("Loaded {} variants from file: {}".format(
        len(all_variants), report_path
    ))

    # The uploaded file is left on the server (removal is disabled):
    # os.remove(media_path + report.report_file.name)
    return True
Beispiel #2
0
def load_into_db(report):
    """
    Bulk load the variants listed in a report's file into the database.

    Nothing is loaded when ``is_loaded(report)`` is true. Otherwise each
    line of the report file that splits into more than one column becomes
    one ``Variant`` attached to ``report``:

    * canonical columns are converted according to ``variant_headers``
      ('int', 'float' or 'str') and stored as attributes of the variant;
    * non-empty extra columns are stored in ``extra_info`` as
      ``name=value`` pairs joined by ``;``.

    All variants are written with a single ``bulk_create`` call.

    :param report: report whose ``report_file.name`` is appended to
        ``settings.MEDIA_ROOT`` to locate the file to read
    :return boolean: False if the report was already loaded, True once the
        variants have been created
    """
    if is_loaded(report):
        return False
    # media path doesn't work here using URL when debug is off
    media_path = settings.MEDIA_ROOT
    report_filename = os.path.basename(report.report_file.name)
    print("{}/{}".format(media_path, report.report_file.name))

    # NOTE(review): a failed format check is only skipped in the log; the
    # load below proceeds either way. Confirm whether it should abort.
    checks_out = report_file_formatter(report_filename)
    report_path = media_path + report.report_file.name
    if checks_out:
        print("{} checks out.".format(report_path))

    all_variants = []
    # The context manager guarantees the file is closed, including when a
    # row fails to convert and an exception propagates.
    with open(report_path, 'r') as report_file:
        header_line_dict = get_header_cols_and_delim(report_file)
        headers = header_line_dict['cols']
        (canonical, extra_headers) = classify_headers(headers)
        print("Canonical: {}\nExtra: {}".format(canonical, extra_headers))
        splitby = header_line_dict['delim']

        positions = {}

        def column_of(name):
            """Return the position of ``name`` in ``headers`` (memoised)."""
            if name not in positions:
                # headers.index keeps the first-occurrence semantics and
                # still raises ValueError for a column that is absent.
                positions[name] = headers.index(name)
            return positions[name]

        # The header row is fixed for the whole file, so decide once which
        # canonical columns are actually present.
        canonical_present = [h for h in canonical if h in headers]

        for line in report_file:
            data = line.rstrip('\n').split(splitby)
            if len(data) <= 1:
                # blank or delimiter-less line: not a data row
                continue

            variant = Variant()
            variant.report = report

            # process canonical
            for h in canonical_present:
                kind = variant_headers[h]
                col = column_of(h)
                if kind == 'int':
                    # empty cells leave the attribute untouched
                    if data[col]:
                        setattr(variant, h, int(data[col]))
                elif kind == 'float':
                    # '%' in the header name is spelled out as 'pct' in
                    # the attribute name
                    field = h.replace('%', 'pct')
                    data[col] = data[col].replace('%', '')
                    if data[col]:
                        # set values that are divided by 0 to tumor alt
                        # count
                        if (h == 'tn_pct_alt_ratio'
                                and data[col] in ('NA', '10000.00')):
                            data[col] = data[column_of('tumor_alt_count')]
                        setattr(variant, field, float(data[col]))
                elif kind == 'str':
                    # two columns are stored under different attribute
                    # names
                    if h == 'chr':
                        field = 'chrom'
                    elif h == 'gene':
                        field = 'gene_name'
                    else:
                        field = h
                    setattr(variant, field, data[col])

            # process extra fields
            extra_headers_list = []
            for eh in extra_headers:
                value = data[column_of(eh)]
                if value:
                    extra_headers_list.append('{}={}'.format(eh, value))
            variant.extra_info = ';'.join(extra_headers_list)

            # Saving each variant individually is far too slow; collect
            # them and insert in bulk below.
            all_variants.append(variant)

    # process all at once
    Variant.objects.bulk_create(all_variants)
    print("Loaded {} variants from file: {}".format(len(all_variants),
                                                    report_path))

    # The uploaded file is left on the server (removal is disabled):
    # os.remove(media_path + report.report_file.name)
    return True