Example #1
def check_mapping_data(mapping_data, headers, filename_column):
    """ Checks mapping data for MIMARKS SampleIDs, unique IDs, fasta file names

    Also returns a dict of fasta file name: SampleID

    mapping_data:  list of lines of data from mapping file
    headers: list of header strings
    filename_column:  Column of metadata mapping file containing fasta filenames
    """

    # First make sure there is a SampleID and filename_column present
    try:
        sample_id_ix = headers.index("SampleID")
    except ValueError:
        raise ValueError(
            "SampleID column not found in mapping file, please " + "check mapping file with validate_mapping_file.py"
        )

    try:
        filename_col_ix = headers.index(filename_column)
    except ValueError:
        raise ValueError("Specified column %s not found in mapping file." % filename_column)

    valid_mimarks = letters + digits + "."

    fasta_name_to_sample_id = {}

    fasta_names = []
    sample_ids = []
    for line in mapping_data:

        try:
            fasta_name_to_sample_id[basename(line[filename_col_ix].strip())] = line[sample_id_ix]
        except IndexError:
            raise IndexError("Missing filename column data in line %s " % line)

        for curr_char in line[sample_id_ix]:
            if curr_char not in valid_mimarks:
                raise ValueError(
                    "Found invalid character in line: %s\n" % line
                    + "SampleIDs must be alphanumeric and . characters "
                    + "only"
                )
        sample_ids.append(line[sample_id_ix].strip())
        fasta_names.append(line[filename_col_ix].strip())

    fasta_name_dups = duplicates_indices(fasta_names)
    if fasta_name_dups:
        raise ValueError(
            "Found duplicate fasta names: %s" % "\t".join([fasta_name for fasta_name in fasta_name_dups.keys()])
        )

    sample_id_dups = duplicates_indices(sample_ids)
    if sample_id_dups:
        raise ValueError(
            "Found duplicate SampleID names: %s" % "\t".join([sample_id for sample_id in sample_id_dups.keys()])
        )

    return fasta_name_to_sample_id
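
A minimal usage sketch for the function above, not from the original module: the imports, the duplicates_indices stub, and the sample rows are illustrative assumptions. Note that the examples on this page target Python 2 (string.letters and the unqualified upper() do not exist in Python 3).

# Hypothetical usage sketch; the stub and all sample data below are made up.
from os.path import basename
from string import letters, digits  # Python 2 only

def duplicates_indices(items):
    """Stub matching the contract these examples expect: map each
    duplicated value to the list of indices where it occurs."""
    seen = {}
    for ix, item in enumerate(items):
        seen.setdefault(item, []).append(ix)
    return dict((k, v) for k, v in seen.items() if len(v) > 1)

headers = ["SampleID", "InputFileName", "Description"]
mapping_data = [
    ["Sample.1", "seqs1.fna", "first sample"],
    ["Sample.2", "seqs2.fna", "second sample"],
]
print(check_mapping_data(mapping_data, headers, "InputFileName"))
# -> {'seqs1.fna': 'Sample.1', 'seqs2.fna': 'Sample.2'} (key order may vary)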
Example #2
def check_mapping_data(mapping_data, headers, filename_column):
    """ Checks mapping data for MIMARKS SampleIDs, unique IDs, fasta file names

    Also returns a dict of fasta file name: SampleID

    mapping_data:  list of lines of data from mapping file
    headers: list of header strings
    filename_column:  Column of metadata mapping file containing fasta filenames
    """

    # First make sure there is a SampleID and filename_column present
    try:
        sample_id_ix = headers.index("SampleID")
    except ValueError:
        raise ValueError("SampleID column not found in mapping file, please " +
                         "check mapping file with validate_mapping_file.py")

    try:
        filename_col_ix = headers.index(filename_column)
    except ValueError:
        raise ValueError("Specified column %s not found in mapping file." %
                         filename_column)

    valid_mimarks = letters + digits + "."

    fasta_name_to_sample_id = {}

    fasta_names = []
    sample_ids = []
    for line in mapping_data:

        try:
            fasta_name_to_sample_id[basename(line[filename_col_ix].strip())] =\
                line[sample_id_ix]
        except IndexError:
            raise IndexError("Missing filename column data in line %s " % line)

        for curr_char in line[sample_id_ix]:
            if curr_char not in valid_mimarks:
                raise ValueError(
                    "Found invalid character in line: %s\n" % line +
                    "SampleIDs must be alphanumeric and . characters " +
                    "only")
        sample_ids.append(line[sample_id_ix].strip())
        fasta_names.append(line[filename_col_ix].strip())

    fasta_name_dups = duplicates_indices(fasta_names)
    if fasta_name_dups:
        raise ValueError(
            "Found duplicate fasta names: %s" %
            "\t".join([fasta_name for fasta_name in fasta_name_dups.keys()]))

    sample_id_dups = duplicates_indices(sample_ids)
    if sample_id_dups:
        raise ValueError(
            "Found duplicate SampleID names: %s" %
            "\t".join([sample_id for sample_id in sample_id_dups.keys()]))

    return fasta_name_to_sample_id
Example #3
def check_fixed_len_bcs_dups(header,
                             mapping_data,
                             errors):
    """ Checks barcodes of same length for duplicates, adds to errors if found

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    header_field_to_check = "BarcodeSequence"

    # Skip if no field BarcodeSequence
    try:
        check_ix = header.index(header_field_to_check)
    except ValueError:
        return errors

    barcodes = []

    correction = 1

    for curr_data in mapping_data:
        barcodes.append(upper(curr_data[check_ix]))

    dups = duplicates_indices(barcodes)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            errors.append('Duplicate barcode %s found.\t%d,%d' %
                          (curr_dup, curr_loc + correction, check_ix))

    return errors
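
A usage sketch with made-up rows, assuming the duplicates_indices stub from the sketch after Example #1; upper here is Python 2's string.upper, which the function calls unqualified.

from string import upper  # Python 2 only

header = ["SampleID", "BarcodeSequence", "Description"]
mapping_data = [
    ["s1", "AACCTT", "x"],
    ["s2", "aacctt", "y"],  # same barcode once uppercased -> flagged
    ["s3", "GGTTAA", "z"],
]
errors = check_fixed_len_bcs_dups(header, mapping_data, [])
# errors now contains:
# 'Duplicate barcode AACCTT found.\t1,1'
# 'Duplicate barcode AACCTT found.\t2,1'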
Example #4
def check_fixed_len_bcs_dups(header,
                             mapping_data,
                             errors):
    """ Checks barcodes of same length for duplicates, adds to errors if found
    
    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """
    
    header_field_to_check = "BarcodeSequence"
    
    # Skip if no field BarcodeSequence 
    try:
        check_ix = header.index(header_field_to_check)
    except ValueError:
        return errors
        
    barcodes = []
    
    correction = 1
    
    for curr_data in mapping_data:
        barcodes.append(curr_data[check_ix])
    
    dups = duplicates_indices(barcodes)
    
    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            errors.append('Duplicate barcode %s found.\t%d,%d' %\
             (curr_dup, curr_loc + correction, check_ix))
    
    return errors
Example #5
def check_variable_len_bcs_dups(header, mapping_data, errors):
    """ Checks variable length barcodes plus sections of primers for dups
    
    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    header_field_to_check = "BarcodeSequence"

    # Skip if no field BarcodeSequence
    try:
        check_ix = header.index(header_field_to_check)
    except ValueError:
        return errors

    linker_primer_field = "LinkerPrimerSequence"

    try:
        linker_primer_ix = header.index(linker_primer_field)
        no_primers = False
    except ValueError:
        no_primers = True

    barcodes = []
    bc_lens = []

    correction = 1

    for curr_data in mapping_data:
        barcodes.append(upper(curr_data[check_ix]))
        bc_lens.append(len(curr_data[check_ix]))

    # Get max length of barcodes to determine how many primer bases to slice
    barcode_max_len = max(bc_lens)

    # Have to do second pass to append correct number of nucleotides to
    # check for duplicates between barcodes and primer sequences

    bcs_added_nts = []
    for curr_data in mapping_data:
        if no_primers:
            bcs_added_nts.append(upper(curr_data[check_ix]))
        else:
            adjusted_len = barcode_max_len - len(curr_data[check_ix])
            bcs_added_nts.append(upper(curr_data[check_ix] + curr_data[linker_primer_ix][0:adjusted_len]))

    dups = duplicates_indices(bcs_added_nts)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            if no_primers:
                errors.append("Duplicate barcode %s found.\t%d,%d" % (curr_dup, curr_loc + correction, check_ix))
            else:
                errors.append(
                    "Duplicate barcode and primer fragment sequence "
                    + "%s found.\t%d,%d" % (curr_dup, curr_loc + correction, check_ix)
                )

    return errors
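
A sketch of the variable-length case with illustrative data: the shorter barcode padded with the start of its primer collides with the longer barcode, which is precisely the situation this check exists to catch. Assumes the stub and Python 2 imports from the earlier sketches.

header = ["SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
mapping_data = [
    ["s1", "AACC", "TTGGACGT"],  # "AACC" + first 2 primer bases -> "AACCTT"
    ["s2", "AACCTT", "GGACGT"],  # collides with the padded barcode above
]
errors = check_variable_len_bcs_dups(header, mapping_data, [])
# 'Duplicate barcode and primer fragment sequence AACCTT found.\t1,1'
# 'Duplicate barcode and primer fragment sequence AACCTT found.\t2,1'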
Example #6
def check_variable_len_bcs_dups(header, mapping_data, errors):
    """ Checks variable length barcodes plus sections of primers for dups
    
    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    header_field_to_check = "BarcodeSequence"

    # Skip if no field BarcodeSequence
    try:
        check_ix = header.index(header_field_to_check)
    except ValueError:
        return errors

    linker_primer_field = "LinkerPrimerSequence"

    try:
        linker_primer_ix = header.index(linker_primer_field)
        no_primers = False
    except ValueError:
        no_primers = True

    barcodes = []
    bc_lens = []

    correction = 1

    for curr_data in mapping_data:
        barcodes.append(upper(curr_data[check_ix]))
        bc_lens.append(len(curr_data[check_ix]))

    # Get max length of barcodes to determine how many primer bases to slice
    barcode_max_len = max(bc_lens)

    # Have to do second pass to append correct number of nucleotides to
    # check for duplicates between barcodes and primer sequences

    bcs_added_nts = []
    for curr_data in mapping_data:
        if no_primers:
            bcs_added_nts.append(upper(curr_data[check_ix]))
        else:
            adjusted_len = barcode_max_len - len(curr_data[check_ix])
            bcs_added_nts.append(upper(curr_data[check_ix] +\
             curr_data[linker_primer_ix][0:adjusted_len]))

    dups = duplicates_indices(bcs_added_nts)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            if no_primers:
                errors.append('Duplicate barcode %s found.\t%d,%d' %\
                 (curr_dup, curr_loc + correction, check_ix))
            else:
                errors.append('Duplicate barcode and primer fragment sequence '+\
                 '%s found.\t%d,%d' % (curr_dup, curr_loc + correction, check_ix))

    return errors
Example #7
def check_sampleid_duplicates(header,
                              mapping_data,
                              errors):
    """ Flags duplicate, missing SampleIDs as errors

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """

    sample_id_field = "SampleID"
    correction = 1

    try:
        sample_id_ix = header.index(sample_id_field)
    except ValueError:
        # Skip out at this point, header check will have error for missing
        # field
        return errors

    sample_ids = []

    # Need to save locations of missing IDs so they aren't flagged twice
    missing_sample_ids = []

    for curr_data in range(len(mapping_data)):
        if len(mapping_data[curr_data][sample_id_ix]) == 0:
            errors.append('Missing SampleID.\t%d,%d' %
                          (curr_data + correction, sample_id_ix))
            missing_sample_ids.append(curr_data + correction)
        sample_ids.append(mapping_data[curr_data][sample_id_ix])

    dups = duplicates_indices(sample_ids)

    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            if (curr_loc + correction) not in missing_sample_ids:
                errors.append('Duplicate SampleID %s found.\t%d,%d' %
                              (curr_dup, curr_loc + correction, sample_id_ix))

    return errors
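
A sketch with made-up rows showing both error paths; row positions in the output are one-based because of the correction offset. Assumes the duplicates_indices stub from the sketch after Example #1.

header = ["SampleID", "BarcodeSequence"]
mapping_data = [
    ["s1", "AAAA"],
    ["s1", "CCCC"],  # duplicate SampleID
    ["", "GGGG"],    # missing SampleID; flagged once, not again as a duplicate
]
errors = check_sampleid_duplicates(header, mapping_data, [])
# 'Missing SampleID.\t3,0'
# 'Duplicate SampleID s1 found.\t1,0'
# 'Duplicate SampleID s1 found.\t2,0'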
Example #8
def check_sampleid_duplicates(header,
                              mapping_data,
                              errors):
    """ Flags duplicate, missing SampleIDs as errors
    
    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    """
    
    sample_id_field = "SampleID"
    correction = 1
    
    try:
        sample_id_ix = header.index(sample_id_field)
    except ValueError:
        # Skip out at this point, header check will have error for missing
        # field
        return errors
        
    sample_ids = []
    
    # Need to save locations of missing IDs so they aren't flagged twice
    missing_sample_ids = []
    
    for curr_data in range(len(mapping_data)):
        if len(mapping_data[curr_data][sample_id_ix]) == 0:
            errors.append('Missing SampleID.\t%d,%d' %\
             (curr_data + correction, sample_id_ix))
            missing_sample_ids.append(curr_data + correction)
        sample_ids.append(mapping_data[curr_data][sample_id_ix])
        
    dups = duplicates_indices(sample_ids)
    
    for curr_dup in dups:
        for curr_loc in dups[curr_dup]:
            if (curr_loc + correction) not in missing_sample_ids:
                errors.append('Duplicate SampleID %s found.\t%d,%d' %\
                 (curr_dup, curr_loc + correction, sample_id_ix))
              
    return errors
Example #9
def check_mapping_data(mapping_data):
    """ Checks mapping data for MIMARKS SampleIDs, unique IDs, fasta file names
    
    Also returns a dict of fasta file name: SampleID
    
    mapping_data:  list of lines of data from mapping file
    """
    
    valid_mimarks = letters + digits + "."
    
    fasta_name_to_sample_id = {}

    fasta_names = []
    for line in mapping_data:
        curr_line = line.strip().split('\t')
        if not curr_line or line.startswith("#"):
            continue
        try:
            fasta_name_to_sample_id[basename(curr_line[1].strip())] = \
                curr_line[0]
        except IndexError:
            raise IndexError("Found non-tab separated line in mapping " +
                             "data.  Offending line is: %s" % line)
        for curr_char in curr_line[0]:
            if curr_char not in valid_mimarks:
                raise ValueError("Found invalid character in line: %s\n" % line +
                                 "SampleIDs must be alphanumeric and . characters " +
                                 "only")
        fasta_names.append(curr_line[1].strip())
       
    fasta_name_dups = duplicates_indices(fasta_names)
    if fasta_name_dups:
        raise ValueError("Found duplicate fasta names: %s" %
                         "\t".join([fasta_name for fasta_name in fasta_name_dups.keys()]))

    return fasta_name_to_sample_id
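
This variant takes raw tab-separated lines rather than pre-split rows and skips comment lines. A sketch with made-up lines, assuming the imports and duplicates_indices stub from the sketch after Example #1:

mapping_data = [
    "#SampleID\tInputFileName",  # header comment line, skipped
    "Sample.1\tseqs1.fna",
    "Sample.2\tseqs2.fna",
]
print(check_mapping_data(mapping_data))
# -> {'seqs1.fna': 'Sample.1', 'seqs2.fna': 'Sample.2'} (key order may vary)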
Example #10
def check_added_demultiplex_dups(header,
                                 mapping_data,
                                 errors,
                                 has_barcodes=True,
                                 added_demultiplex_field=None):
    """ Checks that all barcodes and added demultiplex fields are unique

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    has_barcodes:  True if barcode fields are to be used.
    added_demultiplex_field:  If specified, references a field in the mapping
     file to use for demultiplexing.  These are to be read from fasta labels
     during the actual demultiplexing step.  All combinations of barcodes,
     primers, and the added_demultiplex_field must be unique.
    """

    # Treat as variable length to test combinations of barcodes and the
    # added demultiplex field (should return the same result for the barcode
    # component)
    correction = 1

    header_field_to_check = "BarcodeSequence"
    bc_found = False

    # Skip if no field BarcodeSequence
    if has_barcodes:
        try:
            bc_ix = header.index(header_field_to_check)
            bc_found = True
        except ValueError:
            pass

    linker_primer_field = "LinkerPrimerSequence"

    try:
        linker_primer_ix = header.index(linker_primer_field)
        no_primers = False
    except ValueError:
        no_primers = True

    try:
        added_demultiplex_ix = header.index(added_demultiplex_field)
    except ValueError:
        # Skip out at this point, header check will have error for missing
        # field
        return errors

    barcodes = []
    bc_lens = []
    bcs_added_field = []

    if has_barcodes and bc_found:
        for curr_data in mapping_data:
            barcodes.append(upper(curr_data[bc_ix]))
            bc_lens.append(len(curr_data[bc_ix]))

        # Get max length of barcodes to determine how many primer bases to
        # slice
        barcode_max_len = max(bc_lens)

        # Have to do second pass to append correct number of nucleotides to
        # check for duplicates between barcodes and primer sequences

        for curr_data in mapping_data:
            if no_primers:
                bcs_added_field.append(curr_data[bc_ix] +
                                       curr_data[added_demultiplex_ix])
            else:
                adjusted_len = barcode_max_len - len(curr_data[bc_ix])
                bcs_added_field.append(curr_data[bc_ix] +
                                       curr_data[linker_primer_ix][0:adjusted_len] +
                                       curr_data[added_demultiplex_ix])
    else:
        for curr_data in mapping_data:
            bcs_added_field.append(curr_data[added_demultiplex_ix])

    dups = duplicates_indices(bcs_added_field)

    for curr_dup in dups:
        if has_barcodes and bc_found:
            for curr_loc in dups[curr_dup]:
                errors.append('Duplicate barcode and added demultiplex field ' +
                              '%s found.\t%d,%d' % (curr_dup, curr_loc + correction, bc_ix))
        else:
            for curr_loc in dups[curr_dup]:
                errors.append('Duplicate added demultiplex field ' +
                              '%s found.\t%d,%d' % (curr_dup, curr_loc + correction,
                                                    added_demultiplex_ix))

    return errors
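
A sketch combining barcodes with an added demultiplex field; run_prefix is a hypothetical field name. Two rows share both the barcode and the run_prefix value and so collide, while the third differs in run_prefix and passes. Assumes the stub and Python 2 imports from the earlier sketches.

header = ["SampleID", "BarcodeSequence", "LinkerPrimerSequence", "run_prefix"]
mapping_data = [
    ["s1", "AACC", "GGACGT", "run1"],
    ["s2", "AACC", "GGACGT", "run1"],  # same barcode + same run_prefix
    ["s3", "AACC", "GGACGT", "run2"],  # differs in run_prefix -> no error
]
errors = check_added_demultiplex_dups(header, mapping_data, [],
                                      has_barcodes=True,
                                      added_demultiplex_field="run_prefix")
# 'Duplicate barcode and added demultiplex field AACCrun1 found.\t1,1'
# 'Duplicate barcode and added demultiplex field AACCrun1 found.\t2,1'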
Example #11
def check_added_demultiplex_dups(header,
                                 mapping_data,
                                 errors,
                                 has_barcodes=True,
                                 added_demultiplex_field=None):
    """ Checks that all barcodes and added demultiplex fields are unique
    
    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors
    has_barcodes:  True if barcode fields are to be used.
    added_demultiplex_field:  If specified, references a field in the mapping
     file to use for demultiplexing.  These are to be read from fasta labels
     during the actual demultiplexing step.  All combinations of barcodes,
     primers, and the added_demultiplex_field must be unique.
    """
    
    # Treat as variable length to test combinations of barcodes and the
    # added demultiplex field (should return the same result for the barcode
    # component)
    correction = 1
    
    header_field_to_check = "BarcodeSequence"
    bc_found = False
    
    # Skip if no field BarcodeSequence 
    if has_barcodes:
        try:
            bc_ix = header.index(header_field_to_check)
            bc_found = True
        except ValueError:
            pass
        
    linker_primer_field = "LinkerPrimerSequence"
    
    try:
        linker_primer_ix = header.index(linker_primer_field)
        no_primers = False
    except ValueError:
        no_primers = True
        
    try:
        added_demultiplex_ix = header.index(added_demultiplex_field)
    except ValueError:
        # Skip out at this point, header check will have error for missing
        # field
        return errors
    
    barcodes = []
    bc_lens = []
    bcs_added_field = []
    
    if has_barcodes and bc_found:
        for curr_data in mapping_data:
            barcodes.append(curr_data[bc_ix])
            bc_lens.append(len(curr_data[bc_ix]))
    
        # Get max length of barcodes to determine how many primer bases to slice
        barcode_max_len = max(bc_lens)
    
        # Have to do second pass to append correct number of nucleotides to 
        # check for duplicates between barcodes and primer sequences
    
        
        for curr_data in mapping_data:
            if no_primers:
                bcs_added_field.append(curr_data[bc_ix] +\
                 curr_data[added_demultiplex_ix])
            else:
                adjusted_len = barcode_max_len - len(curr_data[bc_ix])
                bcs_added_field.append(curr_data[bc_ix] +\
                 curr_data[linker_primer_ix][0:adjusted_len] +\
                 curr_data[added_demultiplex_ix])
    else:
        for curr_data in mapping_data:
            bcs_added_field.append(curr_data[added_demultiplex_ix])
    

    dups = duplicates_indices(bcs_added_field)

    for curr_dup in dups:
        if has_barcodes and bc_found:
            for curr_loc in dups[curr_dup]:
                errors.append('Duplicate barcode and added demultiplex field '+\
                  '%s found.\t%d,%d' % (curr_dup, curr_loc + correction, bc_ix))
        else:
            for curr_loc in dups[curr_dup]:
                errors.append('Duplicate added demultiplex field '+\
                 '%s found.\t%d,%d' % (curr_dup, curr_loc + correction,
                 added_demultiplex_ix))
              
    return errors