Ejemplo n.º 1
0
def create_m5_reference(m5_file):
    """
    Map each query in a Blasr M5 file to its lowest-difference target.

    Every record's query and target names are passed through
    get_base_sequence_name.  A hit's difference count is the sum of its
    mismatches, insertions and deletions (nmis + nins + ndel).  For each
    query the target of the hit with the fewest differences is kept; on
    a tie the hit that was read first wins.

    Returns a dict of query name -> target name.
    """
    log.info('Parsing Blasr M5 results from "{0}"'.format(m5_file))
    best_target = {}
    fewest_diffs = {}
    for hit in BlasrReader(m5_file):
        query = get_base_sequence_name(hit.qname)
        target = get_base_sequence_name(hit.tname)
        n_diffs = int(hit.nmis) + int(hit.nins) + int(hit.ndel)
        # Skip hits that do not strictly improve on the stored one.
        if query in fewest_diffs and fewest_diffs[query] <= n_diffs:
            continue
        best_target[query] = target
        fewest_diffs[query] = n_diffs
    log.info('Finished reading Blasr results')
    return best_target
Ejemplo n.º 2
0
def create_m1_reference(m1_file, reference=None):
    """
    Build a query -> target lookup from a Blasr M1 file.

    Query and target names are passed through get_base_sequence_name.
    When a truthy `reference` mapping is supplied, the stored value is
    reference[target] instead of the target name itself (a target that
    is missing from `reference` raises KeyError).

    Raises KeyError if the same query name is seen more than once; the
    message is logged before it is raised.
    """
    log.info('Parsing Blasr M1 results from "{0}"'.format(m1_file))
    lookup = {}
    for hit in BlasrReader(m1_file):
        query = get_base_sequence_name(hit.qname)
        target = get_base_sequence_name(hit.tname)
        if query in lookup:
            problem = 'Duplicate sequence ids found! "{0}"'.format(query)
            log.info(problem)
            raise KeyError(problem)
        lookup[query] = reference[target] if reference else target
    log.info('Finished reading Blasr results')
    return lookup
Ejemplo n.º 3
0
def create_m5_reference(m5_file):
    """
    Pick, for every query in a Blasr M5 file, the closest target.

    "Closest" means the smallest nmis + nins + ndel total among that
    query's records; the earliest record wins a tie.  Both query and
    target names go through get_base_sequence_name before being stored.

    Returns a dict keyed by query name whose values are target names.
    """
    log.info('Parsing Blasr M5 results from "{0}"'.format(m5_file))
    chosen = {}
    lowest = {}
    for aln in BlasrReader(m5_file):
        query_id = get_base_sequence_name(aln.qname)
        target_id = get_base_sequence_name(aln.tname)
        total = int(aln.nmis) + int(aln.nins) + int(aln.ndel)
        # Stored counts are always ints, so None safely means "unseen".
        previous = lowest.get(query_id)
        if previous is None or previous > total:
            chosen[query_id] = target_id
            lowest[query_id] = total
    log.info('Finished reading Blasr results')
    return chosen
Ejemplo n.º 4
0
def create_m1_reference(m1_file, reference=None):
    """
    Read a Blasr M1 file into a dict of query name -> target.

    Names are passed through get_base_sequence_name.  If `reference` is
    truthy, each target name is translated through it (reference[tname])
    before being stored; otherwise the target name is stored directly.

    Raises KeyError (after logging the message) when a query name
    appears in more than one record.
    """
    log.info('Parsing Blasr M1 results from "{0}"'.format(m1_file))
    mapping = {}
    for row in BlasrReader(m1_file):
        q_id = get_base_sequence_name(row.qname)
        t_id = get_base_sequence_name(row.tname)
        if q_id in mapping:
            text = 'Duplicate sequence ids found! "{0}"'.format(q_id)
            log.info(text)
            raise KeyError(text)
        if not reference:
            mapping[q_id] = t_id
        else:
            mapping[q_id] = reference[t_id]
    log.info('Finished reading Blasr results')
    return mapping
Ejemplo n.º 5
0
def append_typing_results(summary_file, combined_typings, output_file):
    """
    Copy a summary table to `output_file` with typing columns appended.

    The first line of `summary_file` is treated as the header and is
    extended with the GenType/GenPctId/ExonType/ExonPctId/Type column
    names.  Every following non-blank line is split on whitespace, the
    second field is passed through get_base_sequence_name, and the
    list stored under that name in `combined_typings` is appended to
    the row, which is written back tab-separated.

    Raises KeyError if a row's name is missing from `combined_typings`.
    """
    # The header extension ends with a newline so that the first data
    # row starts on its own line rather than being glued to the header.
    typing_header = '\tGenType\tGenPctId\tExonType\tExonPctId\tType\n'
    with open(output_file, 'w') as output:
        with open(summary_file, 'r') as handle:
            # The next() builtin works on Python 2.6+ and 3; the default
            # keeps an empty summary file from raising StopIteration.
            header = next(handle, '').strip()
            output.write(header + typing_header)
            for line in handle:
                parts = line.strip().split()
                if not parts:
                    # Blank lines carry no record to annotate.
                    continue
                name = get_base_sequence_name(parts[1])
                parts += combined_typings[name]
                output.write('\t'.join(parts) + '\n')
Ejemplo n.º 6
0
def _parse_blasr_alignment(blasr_file):
    """
    Collect [target name, percent identity] per query from a Blasr file.

    BlasrM1 records contribute their own `pctsimilarity` value as-is.
    For BlasrM5 records the identity is computed as
    100 * nmat / (nmat + nmis + nins + ndel).  Records of any other
    type are ignored.  A later record for the same query (after
    get_base_sequence_name) overwrites an earlier one.
    """
    hits = {}
    for record in BlasrReader(blasr_file):
        key = get_base_sequence_name(record.qname)
        if isinstance(record, BlasrM1):
            hits[key] = [record.tname, record.pctsimilarity]
            continue
        if not isinstance(record, BlasrM5):
            continue
        n_diff = int(record.nmis) + int(record.nins) + int(record.ndel)
        n_match = int(record.nmat)
        # float() keeps this a true division on Python 2 as well.
        identity = 100 * n_match / float(n_match + n_diff)
        hits[key] = [record.tname, identity]
    return hits
Ejemplo n.º 7
0
def _parse_blasr_alignment(blasr_file):
    """
    Return a dict of query name -> [target name, percent identity].

    The identity comes straight from `pctsimilarity` for BlasrM1
    entries and is derived from the match/mismatch/insertion/deletion
    counts for BlasrM5 entries.  Entries that are neither type are
    skipped, and the last entry read for a query name wins.
    """
    parsed = {}
    for aln in BlasrReader(blasr_file):
        query = get_base_sequence_name(aln.qname)
        is_m1 = isinstance(aln, BlasrM1)
        if is_m1 or isinstance(aln, BlasrM5):
            if is_m1:
                score = aln.pctsimilarity
            else:
                errors = int(aln.nmis) + int(aln.nins) + int(aln.ndel)
                matches = int(aln.nmat)
                # Percentage of aligned positions that are matches.
                score = 100 * matches / float(matches + errors)
            parsed[query] = [aln.tname, score]
    return parsed
Ejemplo n.º 8
0
def append_typing_results(summary_file, combined_typings, output_file):
    """
    Write `summary_file` to `output_file` with typing results appended.

    The summary's first line is its header; it is extended with the
    GenType, GenPctId, ExonType, ExonPctId and Type column names.
    Each remaining non-blank row is split on whitespace, its second
    field is passed through get_base_sequence_name, and the matching
    list from `combined_typings` is appended before the row is written
    out tab-separated.

    Raises KeyError if a row's name has no entry in `combined_typings`.
    """
    # End the header with a newline; without it the first data row is
    # written onto the same line as the column names.
    typing_header = '\tGenType\tGenPctId\tExonType\tExonPctId\tType\n'
    with open(output_file, 'w') as output:
        with open(summary_file, 'r') as handle:
            # Use the next() builtin (Python 2.6+ and 3) with a default
            # so an empty summary file does not raise StopIteration.
            header = next(handle, '').strip()
            output.write(header + typing_header)
            for line in handle:
                parts = line.strip().split()
                if not parts:
                    # Nothing to annotate on a blank line.
                    continue
                name = get_base_sequence_name(parts[1])
                parts += combined_typings[name]
                output.write('\t'.join(parts) + '\n')
Ejemplo n.º 9
0
def parse_typing(typing_file):
    """
    Read a whitespace-delimited typing table into a dict.

    Lines beginning with 'Locus' are skipped.  For every other line
    the second field (via get_base_sequence_name) becomes the key and
    the value is [third field, sixth field], both kept as strings.
    """
    typings = {}
    with open(typing_file) as handle:
        for row in handle:
            if not row.startswith('Locus'):
                fields = row.strip().split()
                key = get_base_sequence_name(fields[1])
                typings[key] = [fields[2], fields[5]]
    return typings
Ejemplo n.º 10
0
def separate_listed_sequences(fasta_file, good_values, good_output,
                              bad_output):
    """
    Split a fasta file in two according to a list of accepted names.

    A record is written to `good_output` when its name (passed through
    get_base_sequence_name) is in `good_values`, and to `bad_output`
    otherwise.
    """
    with FastaWriter(good_output) as accepted:
        with FastaWriter(bad_output) as rejected:
            for entry in FastaReader(fasta_file):
                base_name = get_base_sequence_name(entry.name)
                sink = accepted if base_name in good_values else rejected
                sink.writeRecord(entry)
Ejemplo n.º 11
0
def parse_typing(typing_file):
    """
    Parse a typing table into {name: [typing, pctid]}.

    Any line starting with 'Locus' is ignored.  On the remaining lines
    the name is field 2 (passed through get_base_sequence_name), the
    typing is field 3 and the percent identity is field 6, counting
    whitespace-separated fields from 1.  Values are left as strings.
    """
    parsed = {}
    with open(typing_file) as source:
        data_lines = (ln for ln in source if not ln.startswith('Locus'))
        for ln in data_lines:
            columns = ln.strip().split()
            seq_name = get_base_sequence_name(columns[1])
            parsed[seq_name] = [columns[2], columns[5]]
    return parsed
Ejemplo n.º 12
0
def separate_aligned_sequences(fasta_file, dictionary, good_values,
                               good_output, bad_output):
    """
    Split a fasta file in two based on what each record maps to.

    Each record name is passed through get_base_sequence_name and
    looked up in `dictionary`; names that are absent map to "Unmapped".
    Records whose mapped value is in `good_values` go to `good_output`,
    all others go to `bad_output`.
    """
    with FastaWriter(good_output) as accepted:
        with FastaWriter(bad_output) as rejected:
            for entry in FastaReader(fasta_file):
                base_name = get_base_sequence_name(entry.name)
                mapped = dictionary.get(base_name, "Unmapped")
                sink = accepted if mapped in good_values else rejected
                sink.writeRecord(entry)
Ejemplo n.º 13
0
def separate_listed_sequences(fasta_file, good_values, good_output,
                              bad_output):
    """
    Route every fasta record to one of two files by name membership.

    Records whose get_base_sequence_name result is found in
    `good_values` are written to `good_output`; the rest are written
    to `bad_output`.
    """
    with FastaWriter(good_output) as keep:
        with FastaWriter(bad_output) as discard:
            for rec in FastaReader(fasta_file):
                if get_base_sequence_name(rec.name) not in good_values:
                    discard.writeRecord(rec)
                    continue
                keep.writeRecord(rec)
Ejemplo n.º 14
0
def separate_aligned_sequences(fasta_file, dictionary, good_values,
                               good_output, bad_output):
    """
    Route every fasta record to one of two files by its mapped value.

    The record name (after get_base_sequence_name) is looked up in
    `dictionary`, defaulting to "Unmapped".  If the result is in
    `good_values` the record is written to `good_output`, otherwise to
    `bad_output`.
    """
    with FastaWriter(good_output) as keep:
        with FastaWriter(bad_output) as discard:
            for rec in FastaReader(fasta_file):
                key = get_base_sequence_name(rec.name)
                if dictionary.get(key, "Unmapped") not in good_values:
                    discard.writeRecord(rec)
                    continue
                keep.writeRecord(rec)
Ejemplo n.º 15
0
def create_sam_reference(sam_file, reference=None):
    """
    Build a query -> reference lookup from a SAM file.

    Keys are the records' `qname` values exactly as read; only the
    reference name (`rname`) is passed through get_base_sequence_name.
    When a truthy `reference` mapping is given, the stored value is
    reference[name] rather than the name itself.

    Raises KeyError (after logging the message) if a qname repeats.
    """
    log.info('Parsing SAM alignments from "{0}"'.format(sam_file))
    lookup = {}
    for aln in SamReader(sam_file):
        target = get_base_sequence_name(aln.rname)
        if aln.qname in lookup:
            problem = 'Duplicate sequence ids found! "{0}"'.format(aln.qname)
            log.info(problem)
            raise KeyError(problem)
        lookup[aln.qname] = reference[target] if reference else target
    log.info('Finished reading SAM file results')
    return lookup
Ejemplo n.º 16
0
def create_sam_reference(sam_file, reference=None):
    """
    Read a SAM file into a dict of query name -> reference.

    The query name is used unmodified as the key.  The reference name
    goes through get_base_sequence_name and, if `reference` is truthy,
    is then translated via reference[name].

    Raises KeyError when two records share a query name; the message
    is logged first.
    """
    log.info('Parsing SAM alignments from "{0}"'.format(sam_file))
    mapping = {}
    for row in SamReader(sam_file):
        ref_name = get_base_sequence_name(row.rname)
        query = row.qname
        if query in mapping:
            text = 'Duplicate sequence ids found! "{0}"'.format(query)
            log.info(text)
            raise KeyError(text)
        if not reference:
            mapping[query] = ref_name
        else:
            mapping[query] = reference[ref_name]
    log.info('Finished reading SAM file results')
    return mapping
Ejemplo n.º 17
0
def separate_sequences(fasta_file, dictionary, prefix=''):
    """
    Separate a fasta file into multiple groups based on some dict

    Each record name is passed through get_base_sequence_name and
    looked up in `dictionary`; names that are absent fall into the
    "Unmapped" group.  Records are written to
    '<prefix>_<group>.fasta', with one FastaWriter opened per group
    the first time that group is seen.

    Returns the result of closed_file_handles() applied to the dict of
    output path -> writer.
    """
    file_handles = {}
    for record in FastaReader(fasta_file):
        name = get_base_sequence_name(record.name)
        group = dictionary.get(name, "Unmapped")
        group_file = prefix + '_' + group + '.fasta'
        # Test for the writer explicitly instead of catching KeyError
        # around the write: a KeyError raised inside writeRecord must
        # not be mistaken for "no writer yet", which would open a
        # second writer on the same path.
        if group_file not in file_handles:
            file_handles[group_file] = FastaWriter(group_file)
        file_handles[group_file].writeRecord(record)
    return closed_file_handles(file_handles)
Ejemplo n.º 18
0
def separate_sequences(fasta_file, dictionary, prefix=''):
    """
    Separate a fasta file into multiple groups based on some dict

    The group of a record is dictionary[name], where name is the
    record name passed through get_base_sequence_name; records with
    no entry are placed in the "Unmapped" group.  Each group is
    written to its own file named '<prefix>_<group>.fasta'.

    Returns whatever closed_file_handles() returns for the dict of
    output path -> FastaWriter built here.
    """
    file_handles = {}
    for record in FastaReader(fasta_file):
        name = get_base_sequence_name(record.name)
        group = dictionary.get(name, "Unmapped")
        group_file = prefix + '_' + group + '.fasta'
        # Create the writer on first use.  The lookup is kept separate
        # from the write so that a KeyError raised by writeRecord
        # propagates instead of triggering a re-open of the same file.
        writer = file_handles.get(group_file)
        if writer is None:
            writer = FastaWriter(group_file)
            file_handles[group_file] = writer
        writer.writeRecord(record)
    return closed_file_handles(file_handles)