Beispiel #1
0
def insertAnalysisFromTable(filename, projectID, **kw):
    resp = {'ok': False, 'msg': [], 'rows': 0}

    # Test for needed arguments
    try:
        dataset = kw['dataset']
        method = kw['method']
        category = kw['category']
    except:
        resp['msg'].append(
            'Missing arguments. The "dataset", "method", and "category" information is required.'
        )
        return resp

    # Test File input
    try:
        fh = open(filename, 'U')
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    # Test project ID input
    try:
        project = Project.objects.get(
            pk=int(projectID))  # attempt to find the project for the given ID
    except Project.DoesNotExist:
        resp["msg"].append(
            'Project "%s" does not exist' %
            projectID)  # report if the project ID does not exist
        return resp

    header = fh.next().strip().split('\t')
    ctr = 0
    regex = re.compile(
        '[^0-9a-zA-Z_.]')  # this is a list of the valid characters
    for row in fh:
        ctr += 1
        line = row.strip().split('\t')
        s = line[0]
        if len(line) != len(header):
            resp["msg"].append(
                'Data row "%d" has a length "%d", which does not match the header length "%d"'
                % (ctr, len(line), len(header)))
        if bool(regex.findall(s)):
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed'
                % s)
        if s[0].isdigit() or s[0] == "_":
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", the first character must be a letter or a dot'
                % s)
        if s[0] == "." and s[1].isdigit():
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter'
                % s)
        for i in range(1, len(line)):
            try:
                float(line[i])
            except:
                resp["msg"].append(
                    'File "%s" fails validation. Field "%s" contains non-numeric data. Line: %d. Column %d'
                    % (filename, field, ctr, i + 1))
    if len(resp['msg']) != 0:
        return resp
    else:
        resp['ok'] = True
        resp['rows'] = ctr
    fh.seek(0)
    fh.next()
    for row in fh:
        line = row.strip().split('\t')
        s = line[0]
        for i in range(1, len(line)):
            entity = header[i]
            value = float(line[i])
            analysis = Analysis(
                project=project,
                sample=s,
                dataset=dataset,
                method=method,
                category=category,
                entity=entity,
                numreads=int(round(value)),
                profile=value,
                avgscore=1.0,
            )
            analysis.save()
    ok = updateSampleCounts(projectID)
    return resp
Beispiel #2
0
def insertMetadataFromFile(filename, projectID):
    resp = {'ok': False, 'msg': [], 'rows': 0}

    # Test File input
    try:
        #inputdata = read_input(filename)
        fh = open(filename)
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    # Test project ID input
    try:
        project = Project.objects.get(
            pk=int(projectID))  # attempt to find the project for the given ID
    except Project.DoesNotExist:
        resp["msg"].append(
            'Project "%s" does not exist' %
            projectID)  # report if the project ID does not exist
        return resp

    # Once usable data was input check validity of file contents
    #header = inputdata.pop(0)
    header = fh.next().strip().split('\t')
    count = 0
    typecheck = {}
    validsamples = {}
    #for idx, line in enumerate(inputdata):
    for line_str in fh:
        count += 1
        line = line_str.strip().split('\t')
        s = line[0]
        # attribute data has 4 columns
        if len(line) != 4:
            resp["msg"].append(
                'File "%s" fails validation. Must be 4 columns. Line: %d. Content: %s'
                % (filename, count, line_str))
            return resp
        #if line[3] not in typecheck:
        #   typecheck[line[2]] = getfieldtype(line[3])
        #elif getfieldtype(line[3]) != typecheck[line[2]]:
        #   resp["msg"].append('File "%s" fails validation. Field "%s" has inconsistent type. Expected a "%s" but found a "%s". Line: %d. Content: %s' % (filename, line[2], typecheck[line[2]], getfieldtype(line[3]), count, line))
        # Check if entries for these samples already exist and sample formatting
        #samples = list(set(zip(*inputdata)[0]))
        regex = re.compile(
            '[^0-9a-zA-Z_.]')  # this is a list of the valid characters
        #for s in samples:
        if s in validsamples: continue
        if bool(regex.findall(s)):
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed'
                % s)
        if s[0].isdigit() or s[0] == "_":
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", the first character must be a letter or a dot'
                % s)
        if s[0] == "." and s[1].isdigit():
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter'
                % s)
        if Attributes.objects.filter(project=project, sample=s,
                                     field=line[3]).exists():
            resp["msg"].append(
                'Data exists for field %s for sample %s in project %s (%s). Aborting load...'
                % (line[3], s, projectID, project.name))
    if len(resp['msg']) != 0:
        return resp
    else:
        resp['ok'] = True
        resp['rows'] = count
    fh.seek(0)
    fh.next()
    # Insert Validated Input into the database for the project
    for line in fh:
        data = line.strip().split('\t')
        data[3] = dateparse(data[3])
        if isinstance(data[3], datetime.datetime):
            data[3] = data[3].strftime('%Y-%m-%d')

        attributes = Attributes(
            project=project,
            sample=data[0],
            category=data[1],
            field=data[2],
            value=data[3],
        )
        if not AttributeInfo.objects.filter(project=project,
                                            name=attributes.field).exists():
            newinfo = AttributeInfo(
                project=project,
                name=attributes.field,
                fieldtype=getfieldtype(attributes.value),
                values="",
            )
            newinfo.save()
        attributes.save()
    fh.close()
    ok = updateSampleCounts(projectID)
    return resp
Beispiel #3
0
def insertMetadataFromTable(filename, projectID):
    resp = {'ok': False, 'msg': [], 'rows': 0}

    # Test File input
    try:
        fh = open(filename, 'U')
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    # Test project ID input
    try:
        project = Project.objects.get(
            pk=int(projectID))  # attempt to find the project for the given ID
    except Project.DoesNotExist:
        resp["msg"].append(
            'Project "%s" does not exist' %
            projectID)  # report if the project ID does not exist
        return resp

    # Once usable data was input check validity of file contents
    header = fh.next().strip().split('\t')

    ctr = 0
    typecheck = dict()
    regex = re.compile(
        '[^0-9a-zA-Z_.]')  # this is a list of the valid characters
    for row in fh:
        ctr += 1
        line = row.strip().split('\t')
        s = line[0]
        if len(line) != len(header):
            resp["msg"].append(
                'Data row "%d" has a length "%d", which does not match the header length "%d"'
                % (ctr, len(line), len(header)))
        if bool(regex.findall(s)):
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed'
                % s)
        if s[0].isdigit() or s[0] == "_":
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", the first character must be a letter or a dot'
                % s)
        if s[0] == "." and s[1].isdigit():
            resp["msg"].append(
                'Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter'
                % s)
        if Attributes.objects.filter(project=project,
                                     sample=s,
                                     field__in=line[1:]).exists():
            resp["msg"].append(
                'Field data exists for sample %s in project %s (%s). Aborting load...'
                % (line[3], s, projectID, project.name))
        for i in range(1, len(line)):
            field = header[i]
            value = line[i]
            if field not in typecheck:
                typecheck[field] = getfieldtype(value)
            elif getfieldtype(value) != typecheck[field]:
                resp["msg"].append(
                    'File "%s" fails validation. Field "%s" has inconsistent type. Expected a "%s" but found a "%s". Line: %d. Column %d'
                    % (filename, field, typecheck[field], getfieldtype(value),
                       ctr, i + 1))

    if len(resp['msg']) != 0:
        return resp
    else:
        resp['ok'] = True
        resp['rows'] = ctr
    fh.seek(0)
    fh.next()
    for row in fh:
        line = row.strip().split('\t')
        s = line[0]
        for i in range(1, len(line)):
            field = header[i]
            value = line[i]
            value = dateparse(value)
            if isinstance(value, datetime.datetime):
                value = value.strftime('%Y-%m-%d')
            attributes = Attributes(
                project=project,
                sample=s,
                category='metadata',
                field=field,
                value=value,
            )
            if not AttributeInfo.objects.filter(
                    project=project, name=attributes.field).exists():
                newinfo = AttributeInfo(
                    project=project,
                    name=attributes.field,
                    fieldtype=getfieldtype(attributes.value),
                    values="",
                )
                newinfo.save()
            attributes.save()
    fh.close()
    ok = updateSampleCounts(projectID)
    return resp
Beispiel #4
0
def insertAnalysisFromFile(filename, projectID, taxonomy=None):
    resp = {'ok': False, 'msg': [], 'rows': 0}
    samples = dict()

    # Test File input
    try:
        fh = fh = open(filename, 'rU')
        num_lines = sum(1 for line in fh)
        fh.seek(0)
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    print >> sys.stderr, "file contains %d lines" % num_lines

    # Test project ID input
    try:
        project = Project.objects.get(
            pk=int(projectID))  # attempt to find the project for the given ID
    except Project.DoesNotExist:
        resp['msg'].append(
            'Project "%s" does not exist' %
            projectID)  # report if the project ID does not exist
        return resp

    # test taxonomy input
    if taxonomy:
        try:
            taxonomy = Taxonomy.objects.get(pk=int(taxonomy))
        except:
            resp['msg'].append(
                'Unable to determine the taxonomy information (%s)' % taxonomy)
            return resp

    # Once usable data was input check validity of file contents
    header = fh.next()
    count = 0
    taxas = dict()

    print >> sys.stderr, "starting line validation"
    start = datetime.datetime.now()
    regex = re.compile(
        '[^0-9a-zA-Z_.]')  # this is a list of the valid characters
    for idx, line in enumerate(fh):
        count += 1

        line = prepareAnalysisLine(line)
        samples[line[0]] = samples.get(line[0], 0) + 1
        acceptedSamples = dict()

        # check for previous submission
        if line[0] not in acceptedSamples:
            s = line[0]
            if bool(regex.findall(s)):
                resp['msg'].append(
                    'Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed'
                    % s)
                return resp
            if s[0].isdigit() or s[0] == "_":
                resp['msg'].append(
                    'Sample syntax is invalid at sample "%s", the first character must be a letter or a dot'
                    % s)
                return resp
            if s[0] == "." and s[1].isdigit():
                resp['msg'].append(
                    'Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter'
                    % s)
                return resp
            if Analysis.objects.filter(project=project,
                                       sample=s,
                                       dataset=line[1],
                                       method=line[2],
                                       category=line[3],
                                       entity=line[4]).exists():
                resp['msg'].append(
                    'Data exists for sample %s in project %s (%s). Aborting load...'
                    % (s, projectID, project.name))
                return resp
            acceptedSamples[line[0]] = True

        # analysis data has 9 columns
        if len(line) != 9:
            resp['msg'].append(
                'File "%s" fails validation. Must be 9 columns. Line: %d. Content: %s'
                % (filename, count, line))
            return resp

        # Num of reads must be integer
        #try: inputdata[idx][6] = int(line[6])
        try:
            int(line[6])
        except:
            resp['msg'].append(
                'File "%s" fails validation. Column 7 must be an integer. Line: %d. Content: %s'
                % (filename, count, line))
            return resp

        # profile must be a float
        #try: inputdata[idx][7] = float(line[7])
        try:
            float(line[7])
        except:
            resp['msg'].append(
                'File "%s" fails validation. Column 8 must be a float. Line: %d. Content: %s'
                % (filename, count, line))
            return resp

        # average score must be a float
        #try: inputdata[idx][8] =float(line[8])
        try:
            float(line[8])
        except:
            resp['msg'].append(
                'File "%s" fails validation. Column 9 must be a float. Line: %d. Content: %s'
                % (filename, count, line))
            return resp

        # try to find taxa if applicable
        if taxonomy:
            try:
                #tax_id = int(line[5])
                tax_id = line[5]
                if tax_id not in taxas:
                    taxas[tax_id] = TaxaTree.objects.get(taxonomy=taxonomy,
                                                         tax_id=int(line[5]))
            except:
                resp['msg'].append(
                    'File "%s" fails validation. Unable to find taxa. Line %d. Content: %s'
                    % (filename, count, line))
                return resp
        else:
            #taxas[int(line[5])] = None
            taxas[line[5]] = None
        if (count % 10000 == 0):
            elapsed = (datetime.datetime.now() - start).seconds
            if elapsed == 0: elapsed = 1
            print >> sys.stderr, "\t%d lines checked of %d [about %d seconds remaining]" % (
                count, num_lines, (num_lines - count) / (count / elapsed))

    print >> sys.stderr, "starting sample validation"
    # Check if entries for these samples already exist
    #regex = re.compile('[^0-9a-zA-Z_.]') # this is a list of the valid characters
    #for s in samples:
    #    if bool(regex.findall(s)):
    #        resp['msg'].append('Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed' %s)
    #        return resp
    #    if s[0].isdigit() or s[0] == "_":
    #        resp['msg'].append('Sample syntax is invalid at sample "%s", the first character must be a letter or a dot' %s)
    #        return resp
    #    if s[0] == "." and s[1].isdigit():
    #        resp['msg'].append('Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter' %s)
    #        return resp

    # Insert Validated Input into the database for the project
    fh.seek(0)
    fh.next()  # skip header
    print >> sys.stderr, "inserting data"
    count = 0
    batch = []
    start = datetime.datetime.now()
    for line in fh:
        count += 1
        data = prepareAnalysisLine(line)
        analysis = Analysis(
            project=project,
            sample=data[0],
            dataset=data[1],
            method=data[2],
            category=data[3],
            entity=data[4],
            #taxatree = taxas[int(data[5])],
            taxatree=taxas[data[5]],
            numreads=int(data[6]),
            profile=float(data[7]),
            avgscore=float(data[8]),
        )
        batch.append(analysis)
        if (count % 10000 == 0):
            Analysis.objects.bulk_create(batch)
            elapsed = (datetime.datetime.now() - start).seconds
            if elapsed == 0: elapsed = 1
            print >> sys.stderr, "\t%d rows added of %d [about %d seconds remaining]" % (
                count, num_lines, (num_lines - count) / (count / elapsed))
            batch = []
    if len(batch) > 0:
        Analysis.objects.bulk_create(batch)
    ok = updateSampleCounts(projectID)
    if len(resp['msg']) == 0: resp['ok'] = True
    resp['rows'] = count
    fh.close()
    return resp
Beispiel #5
0
def insertAnalysisFromFile(filename, projectID, taxonomy=None):
    resp = {'ok': False, 'msg': [], 'rows': 0}
    samples = dict()

    # Test File input
    try: 
        fh = fh = open(filename, 'rU')
        num_lines = sum(1 for line in fh)
        fh.seek(0)
    except: 
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    print >> sys.stderr, "file contains %d lines" % num_lines
    
    # Test project ID input
    try: project = Project.objects.get(pk=int(projectID))                                     # attempt to find the project for the given ID
    except Project.DoesNotExist: 
        resp['msg'].append('Project "%s" does not exist' % projectID)			      # report if the project ID does not exist
	return resp

    # test taxonomy input
    if taxonomy:
        try: 
           taxonomy = Taxonomy.objects.get(pk=int(taxonomy))
        except: 
            resp['msg'].append('Unable to determine the taxonomy information (%s)' % taxonomy)
            return resp

    # Once usable data was input check validity of file contents
    header = fh.next()
    count = 0
    taxas = dict()

    print >> sys.stderr, "starting line validation"
    start = datetime.datetime.now()
    regex = re.compile('[^0-9a-zA-Z_.]') # this is a list of the valid characters
    for idx, line in enumerate(fh):
        count+=1

        line = prepareAnalysisLine(line)
        samples[line[0]] = samples.get(line[0], 0) + 1
        acceptedSamples = dict()

        # check for previous submission
        if line[0] not in acceptedSamples:
            s = line[0]
            if bool(regex.findall(s)):
                resp['msg'].append('Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed' %s)
                return resp
            if s[0].isdigit() or s[0] == "_":
                resp['msg'].append('Sample syntax is invalid at sample "%s", the first character must be a letter or a dot' %s)
                return resp
            if s[0] == "." and s[1].isdigit():
                resp['msg'].append('Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter' %s)
                return resp
            if Analysis.objects.filter(project = project, sample = s, dataset=line[1], method=line[2], category=line[3], entity=line[4]).exists():
                resp['msg'].append('Data exists for sample %s in project %s (%s). Aborting load...' %(s, projectID, project.name) )
                return resp
            acceptedSamples[line[0]] = True

        # analysis data has 9 columns
        if len(line) != 9: 
            resp['msg'].append('File "%s" fails validation. Must be 9 columns. Line: %d. Content: %s' % (filename, count, line))
            return resp

        # Num of reads must be integer
        #try: inputdata[idx][6] = int(line[6])
        try: int(line[6])
        except: 
            resp['msg'].append('File "%s" fails validation. Column 7 must be an integer. Line: %d. Content: %s' % (filename, count, line))
            return resp

        # profile must be a float
        #try: inputdata[idx][7] = float(line[7])
        try: float(line[7])
        except:  
            resp['msg'].append('File "%s" fails validation. Column 8 must be a float. Line: %d. Content: %s' % (filename, count, line))
            return resp

        # average score must be a float
        #try: inputdata[idx][8] =float(line[8])
        try: float(line[8])
        except: 
            resp['msg'].append('File "%s" fails validation. Column 9 must be a float. Line: %d. Content: %s' % (filename, count, line))
            return resp

        # try to find taxa if applicable
        if taxonomy:
            try: 
                #tax_id = int(line[5])
                tax_id = line[5]
                if tax_id not in taxas:
                   taxas[tax_id] = TaxaTree.objects.get(taxonomy=taxonomy, tax_id=int(line[5]))
            except: 
                resp['msg'].append('File "%s" fails validation. Unable to find taxa. Line %d. Content: %s' % (filename, count, line))
                return resp
        else:
            #taxas[int(line[5])] = None
            taxas[line[5]] = None
        if (count % 10000 == 0): 
            elapsed = (datetime.datetime.now() - start).seconds
            if elapsed == 0: elapsed = 1
            print >> sys.stderr, "\t%d lines checked of %d [about %d seconds remaining]" % (count, num_lines, (num_lines - count)/(count/elapsed))

    print >> sys.stderr, "starting sample validation"
    # Check if entries for these samples already exist
    #regex = re.compile('[^0-9a-zA-Z_.]') # this is a list of the valid characters
    #for s in samples:
    #    if bool(regex.findall(s)): 
    #        resp['msg'].append('Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed' %s)
    #        return resp
    #    if s[0].isdigit() or s[0] == "_": 
    #        resp['msg'].append('Sample syntax is invalid at sample "%s", the first character must be a letter or a dot' %s)
    #        return resp
    #    if s[0] == "." and s[1].isdigit(): 
    #        resp['msg'].append('Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter' %s)
    #        return resp

    # Insert Validated Input into the database for the project
    fh.seek(0)
    fh.next() # skip header
    print >> sys.stderr, "inserting data"
    count= 0
    batch = []
    start = datetime.datetime.now()
    for line in fh:
        count+=1
        data = prepareAnalysisLine(line)
        analysis = Analysis(project  = project,
                            sample   = data[0],
                            dataset  = data[1],
                            method   = data[2],
                            category = data[3],
                            entity   = data[4],
                            #taxatree = taxas[int(data[5])],
                            taxatree = taxas[data[5]],
                            numreads = int(data[6]),
                            profile  = float(data[7]),
                            avgscore = float(data[8]),
                           )
        batch.append(analysis)
        if (count % 10000 == 0):
            Analysis.objects.bulk_create(batch)
            elapsed = (datetime.datetime.now() - start).seconds
            if elapsed == 0: elapsed = 1
            print >> sys.stderr, "\t%d rows added of %d [about %d seconds remaining]" % (count, num_lines, (num_lines - count)/(count/elapsed))
            batch = []
    if len(batch) > 0:
        Analysis.objects.bulk_create(batch)
    ok = updateSampleCounts(projectID)
    if len(resp['msg']) == 0: resp['ok']=True
    resp['rows'] = count
    fh.close()
    return resp
Beispiel #6
0
def insertAnalysisFromTable(filename,projectID,**kw):
    resp = {'ok': False, 'msg': [], 'rows': 0}

    # Test for needed arguments
    try:
        dataset = kw['dataset']
        method  = kw['method']
        category= kw['category']
    except:
        resp['msg'].append('Missing arguments. The "dataset", "method", and "category" information is required.')
        return resp

    # Test File input
    try:
        fh = open(filename, 'U')
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    # Test project ID input
    try:
        project = Project.objects.get(pk=int(projectID))                                           # attempt to find the project for the given ID
    except Project.DoesNotExist:
        resp["msg"].append('Project "%s" does not exist' % projectID)      # report if the project ID does not exist
        return resp

    header = fh.next().strip().split('\t')
    ctr = 0
    regex = re.compile('[^0-9a-zA-Z_.]') # this is a list of the valid characters
    for row in fh:
        ctr+=1
        line = row.strip().split('\t')
        s = line[0]
        if len(line) != len(header):
            resp["msg"].append('Data row "%d" has a length "%d", which does not match the header length "%d"' % ( ctr, len(line), len(header) ))
        if bool(regex.findall(s)):
            resp["msg"].append('Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed' %s)
        if s[0].isdigit() or s[0] == "_":
            resp["msg"].append('Sample syntax is invalid at sample "%s", the first character must be a letter or a dot' %s)
        if s[0] == "." and s[1].isdigit():
            resp["msg"].append('Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter' %s)
        for i in range(1, len(line)):
            try:
                float(line[i])
            except:
                resp["msg"].append('File "%s" fails validation. Field "%s" contains non-numeric data. Line: %d. Column %d' % (filename, field, ctr, i+1))
    if len(resp['msg']) != 0:
        return resp
    else:
        resp['ok'] = True
        resp['rows'] = ctr
    fh.seek(0)
    fh.next()
    for row in fh:
      line = row.strip().split('\t')
      s = line[0]
      for i in range(1, len(line)):
          entity = header[i]
          value = float(line[i])
          analysis = Analysis(project  = project,
                              sample   = s,
                              dataset  = dataset,
                              method   = method,
                              category = category,
                              entity   = entity,
                              numreads = int(round(value)),
                              profile  = value,
                              avgscore = 1.0,
                             )
          analysis.save()
    ok = updateSampleCounts(projectID)
    return resp
Beispiel #7
0
def insertMetadataFromTable(filename, projectID):
    resp = {'ok': False, 'msg': [], 'rows': 0}

    # Test File input
    try:
        fh = open(filename, 'U')
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    # Test project ID input
    try:
        project = Project.objects.get(pk=int(projectID))                                           # attempt to find the project for the given ID
    except Project.DoesNotExist:
        resp["msg"].append('Project "%s" does not exist' % projectID)      # report if the project ID does not exist
        return resp

    # Once usable data was input check validity of file contents
    header = fh.next().strip().split('\t') 
    
    ctr = 0
    typecheck = dict()
    regex = re.compile('[^0-9a-zA-Z_.]') # this is a list of the valid characters
    for row in fh:
        ctr+=1
        line = row.strip().split('\t')
        s = line[0]
        if len(line) != len(header):
            resp["msg"].append('Data row "%d" has a length "%d", which does not match the header length "%d"' % ( ctr, len(line), len(header) ))
        if bool(regex.findall(s)):
            resp["msg"].append('Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed' %s)
        if s[0].isdigit() or s[0] == "_":
            resp["msg"].append('Sample syntax is invalid at sample "%s", the first character must be a letter or a dot' %s)
        if s[0] == "." and s[1].isdigit():
            resp["msg"].append('Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter' %s)
        if Attributes.objects.filter(project = project, sample = s, field__in=line[1:]).exists():
            resp["msg"].append('Field data exists for sample %s in project %s (%s). Aborting load...' %(line[3], s, projectID, project.name) )
        for i in range(1, len(line)):
            field = header[i]
            value = line[i]
            if field not in typecheck:
                typecheck[field] = getfieldtype(value)
            elif getfieldtype(value) != typecheck[field]:
                resp["msg"].append('File "%s" fails validation. Field "%s" has inconsistent type. Expected a "%s" but found a "%s". Line: %d. Column %d' % (filename, field, typecheck[field], getfieldtype(value), ctr, i+1))

    if len(resp['msg']) != 0:
        return resp
    else:
        resp['ok'] = True
        resp['rows'] = ctr 
    fh.seek(0)
    fh.next()
    for row in fh:
      line = row.strip().split('\t')
      s = line[0]
      for i in range(1, len(line)):
          field = header[i]
          value = line[i]
          value = dateparse(value)
          if isinstance(value, datetime.datetime): value = value.strftime('%Y-%m-%d')
          attributes = Attributes(project  = project,
                                  sample   = s,
                                  category = 'metadata',
                                  field    = field,
                                  value    = value,
                                 )
          if not AttributeInfo.objects.filter(project=project, name=attributes.field).exists():
              newinfo = AttributeInfo(project   = project,
                                      name      = attributes.field,
                                      fieldtype = getfieldtype(attributes.value),
                                      values    = "",
                                     )
              newinfo.save()
          attributes.save()
    fh.close()
    ok = updateSampleCounts(projectID)
    return resp
Beispiel #8
0
def insertMetadataFromFile(filename, projectID):
    resp = {'ok': False, 'msg': [], 'rows': 0}

    # Test File input
    try: 
        #inputdata = read_input(filename)
        fh = open(filename)
    except:
        resp['msg'].append('Error in accessing file "%s"' % (filename))
        return resp

    # Test project ID input
    try: 
        project = Project.objects.get(pk=int(projectID))                                           # attempt to find the project for the given ID
    except Project.DoesNotExist: 
        resp["msg"].append('Project "%s" does not exist' % projectID)      # report if the project ID does not exist
        return resp

    # Once usable data was input check validity of file contents
    #header = inputdata.pop(0)
    header = fh.next().strip().split('\t')
    count = 0
    typecheck = {}
    validsamples = {}
    #for idx, line in enumerate(inputdata):
    for line_str in fh:
        count+=1
        line = line_str.strip().split('\t')
	s = line[0]
        # attribute data has 4 columns
        if len(line) != 4: 
           resp["msg"].append('File "%s" fails validation. Must be 4 columns. Line: %d. Content: %s' % (filename, count, line_str))
           return resp
        #if line[3] not in typecheck: 
        #   typecheck[line[2]] = getfieldtype(line[3])
        #elif getfieldtype(line[3]) != typecheck[line[2]]: 
        #   resp["msg"].append('File "%s" fails validation. Field "%s" has inconsistent type. Expected a "%s" but found a "%s". Line: %d. Content: %s' % (filename, line[2], typecheck[line[2]], getfieldtype(line[3]), count, line))
        # Check if entries for these samples already exist and sample formatting
        #samples = list(set(zip(*inputdata)[0]))
        regex = re.compile('[^0-9a-zA-Z_.]') # this is a list of the valid characters
        #for s in samples:
        if s in validsamples: continue
        if bool(regex.findall(s)): 
            resp["msg"].append('Sample syntax is invalid at sample "%s", only alpha, numeric, underscore, and dot characters are allowed' %s)
        if s[0].isdigit() or s[0] == "_": 
            resp["msg"].append('Sample syntax is invalid at sample "%s", the first character must be a letter or a dot' %s)
        if s[0] == "." and s[1].isdigit(): 
            resp["msg"].append('Sample syntax is invalid at sample "%s", the first character is a dot, so the second must be a letter' %s)
        if Attributes.objects.filter(project = project, sample = s, field=line[3]).exists(): 
            resp["msg"].append('Data exists for field %s for sample %s in project %s (%s). Aborting load...' %(line[3], s, projectID, project.name) )
    if len(resp['msg']) != 0:
        return resp
    else: 
        resp['ok'] = True
        resp['rows'] = count
    fh.seek(0)
    fh.next()
    # Insert Validated Input into the database for the project
    for line in fh:
        data = line.strip().split('\t')
        data[3] = dateparse(data[3])
        if isinstance(data[3], datetime.datetime): data[3] = data[3].strftime('%Y-%m-%d')

        attributes = Attributes(project  = project,
                                sample   = data[0],
                                category = data[1],
                                field    = data[2],
                                value    = data[3],
                               )
        if not AttributeInfo.objects.filter(project=project, name=attributes.field).exists():
            newinfo = AttributeInfo(project   = project,
                                    name      = attributes.field,
                                    fieldtype = getfieldtype(attributes.value),
                                    values    = "",
                                   )
            newinfo.save()
        attributes.save()
    fh.close()
    ok = updateSampleCounts(projectID)
    return resp