# NOTE(review): this chunk arrived with its newlines stripped — the statements
# below were collapsed onto a single physical line, which is a syntax error in
# Python.  Reconstructed here with one statement per line; no code tokens were
# changed except where explicitly noted.  Helpers such as `compose`,
# `compose_all`, `pmap`, `partial2`, `fzip`, `starcompose`, `merge_dicts`,
# `parse_option`, `parse_cigar`, `eval_flag`, `flag_meanings`, `basic_schema`,
# `tabsplit`, `sanger_qual_str_to_error` and `get_row` are defined elsewhere
# in this (scrambled) file.

def flag_dict(flag):
    """Expand an integer SAM FLAG into a {meaning: bool} dict via flag_meanings."""
    return dict((meaning, eval_flag(bit, flag)) for bit, meaning in flag_meanings.items())

def split_list(A, idx):
    """Split sequence A at position idx, returning (A[:idx], A[idx:])."""
    return A[:idx], A[idx:]

# The eleven mandatory SAM columns, in spec order.
sam_columns = ("QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL")
#options
#TODO: get_record function takes a filehandle and returns a single record via SeqIO, etc.
#So functions expect a dictionary I guess
#pass

# Pipelines below are point-free compositions; each takes the output of the
# previous stage.  `parse_options` turns the optional trailing SAM fields into
# a dict; `line_to_dict` zips a split line with the mandatory column names.
parse_options = compose(dict, pmap(parse_option)) #, tabsplit)
#readfields = compose(tabsplit, next)
line_to_dict = compose_all(dict, partial(zip, sam_columns)) #, tabsplit)
validated_dict = compose(basic_schema.validate, line_to_dict)
# Split a tab-separated record into (mandatory fields, optional fields).
fields_and_options = compose(partial2(split_list, len(sam_columns)), tabsplit)
parsers = partial(fzip, [validated_dict, parse_options])
parse_fields_and_options = compose(parsers, fields_and_options)
all_but_cigar_dict = starcompose(merge_dicts, parse_fields_and_options)
get_cigar_dict = compose(parse_cigar, itemgetter('CIGAR'))
get_flag_dict = compose(flag_dict, itemgetter('FLAG'))
get_error = compose(sanger_qual_str_to_error, itemgetter('QUAL'))

def load_sam(fh):
    """Read an open SAM filehandle into a pandas DataFrame, one row per non-empty line."""
    dicts = map(get_row, ifilter(bool, fh.read().split('\n')))
    return pd.DataFrame(dicts)
    #TODO: do we really need indices? it complicates querying; it looks like maybe where plays better with them
    # .set_index(index) #, index=index, columns=columns)

# NOTE(review): the following definition is truncated in this chunk (its body
# is not visible here), so it is left commented out to keep the module
# parseable.  TODO: restore the body of get_row from the original source.
#def get_row(row):
def obj_to_dict(obj, names_getters_map):
    """Apply each getter in names_getters_map to obj, returning {name: getter(obj)}.

    names_getters_map maps a field name to a one-argument accessor.
    `partial2`, `apply_to_object` and `dictmap` are helpers defined elsewhere
    in this module.
    """
    # NOTE(review): reconstructed from a newline-stripped line; tokens unchanged.
    apply_to_obj = partial2(apply_to_object, obj)
    return dictmap(apply_to_obj, names_getters_map)
# NOTE(review): this line is a newline-stripped near-duplicate of the chunk
# that follows it, but it contains an odd number of ''' markers, so its
# original line structure cannot be reconstructed unambiguously.  Left
# byte-identical.  TODO: recover the original layout (or delete the
# duplicate) from version control.
pcompose = partial(partial, compose) error_from_ints = pcompose(error) #sanger_qual_str_to_error = cmperror(qual_to_phreds) ''' get_fastq = partial(SeqIO.parse, format='fastq') get_fasta = partial(SeqIO.parse, format='fasta') to_np_int = partial(np.array, dtype=int) gccontent = compose(ilen, pifilter('GC'.__contains__)) minus33 = partial(add, -33) qual_int_sanger = compose(minus33, ord) ''' Error = 10^-(Phred/10) ''' qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger)) error = compose(partial(pow, 10), partial2(div, -10.0)) #don't need to map because numpy vectorizes it automatically #TODO: handle non-sanger version sanger_qual_str_to_error = compose(error, qual_to_phreds) #SANGER_OFFSET = 33 ''' assert len(quality) == len(error) == len(phred_scores) ''' #validate = scheme.validate
# NOTE(review): reconstructed from a newline-stripped line; statements are
# unchanged, only the lost line breaks (and this commentary) were restored.
# The triple-quoted string literals are kept byte-for-byte on single lines —
# they are discarded expression statements used as commented-out code/notes.
''' pcompose = partial(partial, compose) error_from_ints = pcompose(error) #sanger_qual_str_to_error = cmperror(qual_to_phreds) '''

# Lazy SeqIO readers with the format pre-bound.
get_fastq = partial(SeqIO.parse, format='fastq')
get_fasta = partial(SeqIO.parse, format='fasta')
to_np_int = partial(np.array, dtype=int)
# Count of G/C characters in a sequence (ilen/pifilter defined elsewhere).
gccontent = compose(ilen, pifilter('GC'.__contains__))
# Quality char -> Phred score, assuming Phred+33 ("Sanger") encoding.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
''' Error = 10^-(Phred/10) '''
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
# error(phreds) == 10 ** (phreds / -10.0), per the formula above.
error = compose(partial(pow, 10), partial2(div, -10.0)) #don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)
#SANGER_OFFSET = 33
''' assert len(quality) == len(error) == len(phred_scores) '''
#validate = scheme.validate
#TODO: could make these validations match samtools spec
#TODO: Could treat options/cigar string as their own class with their own parsing and validation.

# NOTE(review): the following definition is truncated in this chunk (its body
# is not visible here), so it is left commented out to keep the module
# parseable.  TODO: restore the body of flatten_vcf from the original source.
#def flatten_vcf(record):
# NOTE(review): reconstructed from a newline-stripped line; statements are
# unchanged except as explicitly noted.  This module is Python 2 only:
# `operator.div` and `itertools.ifilter` do not exist in Python 3.
import operator as op
from operator import add, div
from schema import Schema, Use
from itertools import ifilter

# Parse options
#from pyparsing import Regex
to_np_int = partial(np.array, dtype=int)
# '[1,2,3]'-style payload -> numpy int array (psplit/pstrip defined elsewhere).
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))
tabsplit = psplit('\t')
# Quality char -> Phred score, assuming Phred+33 ("Sanger") encoding.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
# error(phreds) == 10 ** (phreds / -10.0)
error = compose(partial(pow, 10), partial2(div, -10.0)) #don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)

# NOTE(review): the basic_scheme dict below is truncated in this chunk (the
# literal is cut off mid-entry), so it is left commented out to keep the
# module parseable.  TODO: restore the remaining entries from the original
# source.
#basic_scheme={
#    'QNAME' : str,
#    'FLAG' : int,
#    'RNAME' : str,
#    'POS' : int,
#    'MAPQ' : int,
#    'CIGAR' : str,
#    'RNEXT' : str,
#    'PNEXT' : int,
#    'TLEN' : int,
#    #'MRNM' : str,