def __validate_input(ref, seq):
    """Check that ``seq`` is an acceptable sequence string for ``ref``.

    Args:
        ref: object exposing ``length_with_deletions()``.
        seq: the sequence string to validate.

    Raises:
        weCallException: if ``len(seq)`` differs from
            ``ref.length_with_deletions()``, or if ``seq`` contains a
            character that the accepted-alphabet pattern does not list.
    """
    seq_length = len(seq)
    expected_length = ref.length_with_deletions()
    if seq_length != expected_length:
        raise weCallException(
            "Sequence has to be of the same length as reference.")

    # The whole string (possibly empty) must consist of the listed
    # upper-case codes, '*' or '.'.
    legal = re.match(r'^[ACGTURYKMSWBDHVN\*\.]*\Z', seq)
    if legal is None:
        message = "Illegal character in sequence {!r}".format(seq)
        raise weCallException(message)
def __validate_input(ref_char, seq_char, qual_char):
    """Validate the three characters that describe one sequence position.

    Args:
        ref_char: reference character at the position.
        seq_char: sequence character at the position.
        qual_char: quality character at the position.

    Raises:
        weCallException: if any of the three arguments does not have
            length 1, or if ``ref_char`` equals ``MISSING_BASE``.
    """
    for char in (ref_char, seq_char, qual_char):
        if len(char) != 1:
            raise weCallException(
                "All characters at sequence position has to be of length 1.")

    if ref_char == MISSING_BASE:
        raise weCallException("Missing reference character.")
def __validate_character_combination(self):
    """Reject inconsistent ref/sequence/quality character combinations.

    Raises:
        weCallException: if the reference character is ``DELETED_BASE``
            while the sequence character is ``MATCHING_BASE``; if the
            sequence character is ``DELETED_BASE`` but a quality other than
            ``MISSING_BASE`` is set; or if ``self.is_gap`` is true but a
            quality other than ``MISSING_BASE`` is set.
    """
    ref_char = self.ref_char
    seq_char = self.seq_char
    if ref_char == DELETED_BASE and seq_char == MATCHING_BASE:
        message = (
            "Invalid character combination: ref char = {}, sequence char = {}"
            .format(ref_char, seq_char))
        raise weCallException(message)

    has_quality = self.qual_char != MISSING_BASE
    if seq_char == DELETED_BASE and has_quality:
        raise weCallException(
            "Cannot assign base quality to a deleted base.")
    if self.is_gap and has_quality:
        raise weCallException("Cannot assign base quality inside a gap.")
def add_sample_data(self, sample_name, key_name, sample_data_value):
    """Store ``sample_data_value`` for a given sample under a given key.

    Args:
        sample_name: name of an already-registered sample.
        key_name: name of an already-registered key.
        sample_data_value: value to store; must be a ``GenotypeCall`` when
            ``key_name`` equals ``GENOTYPE_KEY``.

    Raises:
        weCallException: if the key or the sample name is unknown, or if a
            non-``GenotypeCall`` value is supplied for the genotype key.
    """
    if key_name not in self.__key_to_sample_values:
        raise weCallException(
            "Missing key {} when adding sample data.".format(key_name))

    if sample_name not in self.__sample_names:
        raise weCallException(
            "Missing sample name {} supplied when adding sample data.".
            format(sample_name))

    is_genotype_key = key_name == GENOTYPE_KEY
    if is_genotype_key and not isinstance(sample_data_value, GenotypeCall):
        raise weCallException("Genotype field must be a GenotypeCall.")

    # Values are stored per key, positioned by the sample's index in the
    # list of sample names.
    sample_index = self.__sample_names.index(sample_name)
    values_for_key = self.__key_to_sample_values[key_name]
    values_for_key[sample_index] = sample_data_value
def read_interval(interval_string):
    """Parse a ``"<start>-<end>"`` string into an ``Interval``.

    Args:
        interval_string: text holding two integers separated by a single
            ``-``.

    Returns:
        ``Interval(start, end)`` built from the two parsed integers.

    Raises:
        weCallException: if ``end`` is not strictly greater than ``start``.
        ValueError: if the string does not split into exactly two parts or
            a part is not an integer (propagated from unpacking / ``int``).
    """
    start_string, end_string = interval_string.split("-")
    start = int(start_string)
    end = int(end_string)

    if start >= end:
        message = "Interval {} does not have start < end".format(
            interval_string)
        raise weCallException(message)

    return Interval(start, end)
def get_variant_support(self, sample_name):
    """Return the variant support for ``sample_name``.

    The keys of ``VARIANT_SUPPORT_MAP`` are tried in order; for the first
    one that ``has_genotype_key`` reports, the sample's field value is
    passed through the callable mapped to that key and returned.

    Raises:
        weCallException: if none of the keys is present.
    """
    for key, converter in VARIANT_SUPPORT_MAP.items():
        if not self.has_genotype_key(key):
            continue
        raw_value = self.get_field(sample_name, key)
        return converter(raw_value)

    expected_keys = list(VARIANT_SUPPORT_MAP.keys())
    raise weCallException(
        "Expected one of {} as the variant support key.".format(
            expected_keys))
def __init__(self, quality_string, quality_mapping=QUALITY_MAPPING):
    """Initialise the quality object from a quality string.

    Args:
        quality_string: raw quality string; must satisfy
            ``SequenceQuality.is_valid_qual``.
        quality_mapping: mapping stored on the instance as
            ``quality_mapping``; defaults to ``QUALITY_MAPPING``.

    Raises:
        weCallException: if ``quality_string`` fails validation.
    """
    string_is_valid = SequenceQuality.is_valid_qual(quality_string)
    if not string_is_valid:
        message = "Illegal character in the quality string {!r}".format(
            quality_string)
        raise weCallException(message)

    # The mapping is stored before parsing, preserving the original order.
    self.quality_mapping = quality_mapping
    self.ascii_quality = self.parse_quality_to_ascii(quality_string)
def add_sample_name(self, sample_name):
    """Register a new sample and return its freshly created sequence bank.

    Args:
        sample_name: name of the sample to add.

    Returns:
        The new ``SequenceBank`` (built on ``self.reference``) that is now
        stored under ``sample_name``.

    Raises:
        weCallException: if ``sample_name`` has already been added.
    """
    if sample_name in self.__samples:
        message = "Sample {} already exists in the SampleBank.".format(
            sample_name)
        raise weCallException(message)

    new_bank = SequenceBank(self.reference)
    self.__samples[sample_name] = new_bank
    return new_bank
def index(self):
    """Run ``samtools faidx`` on ``self.filename``.

    The ``samtools`` executable is looked up in the directory named by the
    ``WECALL_BIN`` environment variable.

    Returns:
        ``self``, so that the call can be chained.

    Raises:
        KeyError: if ``WECALL_BIN`` is not set in the environment.
        weCallException: if the tool finishes with a non-zero return code.
            The message names the file and the return code (previously the
            message was empty, which made failures hard to diagnose).
    """
    tool_runner = ToolRunner()
    samtools_path = os.path.join(os.environ['WECALL_BIN'], "samtools")
    tool_runner.start([samtools_path, "faidx", self.filename])

    if tool_runner.return_code != 0:
        raise weCallException(
            "samtools faidx failed for {!r} with return code {}".format(
                self.filename, tool_runner.return_code))
    return self
def __get_expected_calls_from_haplotypes(ascii_strings, reference):
    """Derive expected genotype calls from two ascii haplotype strings.

    Args:
        ascii_strings: exactly two haplotype strings, each as long as
            ``reference.length_with_deletions()``.
        reference: reference the haplotypes are compared against.

    Returns:
        dict mapping each variant to ``GenotypeCall("1/1")`` when it is
        found on both haplotypes and to ``GenotypeCall("0/1")`` when it is
        found on only one of them.

    Raises:
        weCallException: if there are not exactly two haplotypes, or if a
            haplotype has the wrong length.
    """
    if len(ascii_strings) != 2:
        raise weCallException(
            "Expected calls have to be defined as a diploid.")

    for haplotype in ascii_strings:
        if len(haplotype) != reference.length_with_deletions():
            raise weCallException(
                "Ascii haplotypes have to be of the same length as the reference")

    first_variants = Sequence(reference, ascii_strings[0]).variants
    second_variants = Sequence(reference, ascii_strings[1]).variants

    shared = first_variants.intersection(second_variants)
    unshared = first_variants.symmetric_difference(second_variants)

    calls = {variant: GenotypeCall("1/1") for variant in shared}
    calls.update({variant: GenotypeCall("0/1") for variant in unshared})
    return calls
def trimmed_vcf_ref_alt(ref, alt):
    """Trim a ref/alt pair while keeping one base of context where needed.

    The pair is first trimmed with ``trimmed_ref_alt``. If ``ref`` and
    ``alt`` differ in length, or trimming removed everything from both, one
    base of context is added back: the base after the trimmed region when
    the trim offset is 0, otherwise the base before it.

    Args:
        ref: non-empty reference allele.
        alt: non-empty alternative allele.

    Returns:
        Tuple ``(offset, ref, alt)`` where ``offset`` is the start of the
        returned alleles relative to the start of the input alleles.

    Raises:
        weCallException: if either allele is empty, if the alleles are
            identical and longer than one base, or if either equals
            ``UNKNOWN``.
    """
    if len(ref) == 0 or len(alt) == 0:
        raise weCallException("VCF format requires non-empty ref and alt")
    if ref == alt and len(ref) > 1:
        raise weCallException("VCF requires refcalls of length 1")
    if alt == UNKNOWN or ref == UNKNOWN:
        # VCF allows this to indicate unknown data.
        raise weCallException("not dealing with monomorphic variants")

    offset, new_ref, new_alt = trimmed_ref_alt(ref, alt)

    needs_context = len(ref) != len(alt) or (not new_ref and not new_alt)
    # Context goes after the trimmed region only when nothing precedes it.
    leading = 1 if needs_context and offset != 0 else 0
    trailing = 1 if needs_context and offset == 0 else 0

    start = offset - leading
    ref_end = offset + len(new_ref)
    alt_end = offset + len(new_alt)

    result_ref = ref[start:offset] + new_ref + ref[ref_end:ref_end + trailing]
    result_alt = alt[start:offset] + new_alt + alt[alt_end:alt_end + trailing]
    return start, result_ref, result_alt
def read_records(schema, line):
    """Yield the ``Record`` objects described by one line of a VCF file.

    The line is stripped, split on tab characters and handed to
    ``generate_records`` together with ``schema``.

    Args:
        schema: schema passed through to ``generate_records``.
        line: a single text line of the VCF file.

    Yields:
        Each item produced by ``generate_records(schema, cols)``.

    Raises:
        weCallException: propagated unchanged when raised while generating
            records; any other ``Exception`` is converted to a
            ``weCallException`` whose message includes the offending line
            and the original error text. The new exception carries the
            traceback of the original error.
    """
    try:
        cols = line.strip().split("\t")
        for item in generate_records(schema, cols):
            yield item
    except weCallException:
        raise
    except Exception as exc:
        # Python 3 exceptions have no ``.message`` attribute; format the
        # exception itself so the handler cannot fail with AttributeError.
        message = "while reading record from line {!r}: {!s}".format(
            line, exc)
        raise weCallException(message).with_traceback(exc.__traceback__)
def __potentially_merge_adjacent_variants(var_1, var_2):
    """Merge two variants of the same indel type into one, when applicable.

    Args:
        var_1: first variant, or ``None``.
        var_2: second variant, or ``None``.

    Returns:
        ``(var_1, var_2)`` unchanged when either is ``None``, when their
        types differ, or when the shared type is ``TYPE_SNP`` or
        ``TYPE_REF``. For ``TYPE_DEL`` / ``TYPE_INS`` the result is
        ``(None, merged)``, where ``merged`` starts at ``var_1`` and extends
        its ref (deletion) or alt (insertion) by the last character of the
        corresponding allele of ``var_2``.

    Raises:
        weCallException: if the shared type is none of the above.
    """
    if var_1 is None or var_2 is None or var_1.type != var_2.type:
        return var_1, var_2

    shared_type = var_1.type
    if shared_type == TYPE_SNP or shared_type == TYPE_REF:
        return var_1, var_2

    if shared_type == TYPE_DEL:
        extended_ref = var_1.ref + var_2.ref[-1]
        merged = Variant(var_1.chrom, var_1.pos_from, extended_ref, var_1.alt)
        return None, merged

    if shared_type == TYPE_INS:
        extended_alt = var_1.alt + var_2.alt[-1]
        merged = Variant(var_1.chrom, var_1.pos_from, var_1.ref, extended_alt)
        return None, merged

    raise weCallException(
        "Unexpected variant type: " + TYPE_TO_STR[shared_type])
def get_genotype_likelihoods(self, sample_name):
    """Return the genotype likelihoods of ``sample_name``, unscaled.

    The keys of ``LIKELIHOOD_SCALING_FACTOR`` are tried in order. For the
    first one that ``has_genotype_key`` reports, the stored field is read
    and every entry is divided by that key's scaling factor.

    Returns:
        ``None`` or ``'.'`` when that is the stored field; otherwise a list
        in which ``None`` / ``'.'`` entries become ``None`` and all other
        entries are divided by the scaling factor.

    Raises:
        weCallException: if none of the likelihood keys is present.
    """
    def unscale(stored, factor):
        if stored is None or stored == '.':
            return stored
        result = []
        for value in stored:
            result.append(None if value in {None, '.'} else value / factor)
        return result

    for key, factor in LIKELIHOOD_SCALING_FACTOR.items():
        if self.has_genotype_key(key):
            stored = self.get_field(sample_name, key)
            return unscale(stored, factor)

    expected_keys = list(LIKELIHOOD_SCALING_FACTOR.keys())
    raise weCallException(
        "Expected one of {} as the likelihood key.".format(expected_keys))
def _get_variants(self):
    """Collect the variants implied by this sequence against its reference.

    Walks ``self._reference.ref_seq`` and ``self._seq`` in lockstep and, at
    every position where the sequence does not match, builds a ``Variant``
    (insertion, deletion or SNP). Each variant is passed, together with the
    previous pending one, to ``__add_variant_to_set``, which decides what
    ends up in the result set and what stays pending.

    Returns:
        The set of variants after ``__remove_deletions_from_edges`` has
        been applied.

    Raises:
        weCallException: if a position combines a deleted reference base
            with a matching sequence character.
    """
    variants = set()
    # Starts one before pos_from; it is advanced before use for every
    # position whose reference character is not DELETED_BASE.
    ref_index = self.pos_from - 1
    # Variant handed back by the last __add_variant_to_set call (pending).
    current_variant = None
    for ref_char, alt_char in zip(self._reference.ref_seq, self._seq):
        if ref_char != DELETED_BASE:
            ref_index += 1
        if ref_char == DELETED_BASE and alt_char == MATCHING_BASE:
            raise weCallException(
                "Invalid sequence at ref position {}".format(ref_index))
        elif ref_char == DELETED_BASE and alt_char == DELETED_BASE:
            # Nothing on either side at this column: skip it without
            # touching the pending variant.
            continue
        elif alt_char == MATCHING_BASE:
            # A match is reported as "no new variant".
            current_variant = self.__add_variant_to_set(
                current_variant, None, variants)
            continue
        if ref_char == DELETED_BASE:
            # insertion
            var_pos = ref_index
            var_ref = self._reference[var_pos]
            var_alt = var_ref + alt_char
        elif alt_char == DELETED_BASE:
            # deletion
            # Anchored on the reference base preceding the deleted one.
            var_pos = ref_index - 1
            var_ref = self._reference[var_pos] + ref_char
            var_alt = self._reference[var_pos]
        else:
            # SNP
            var_pos = ref_index
            var_ref = ref_char
            var_alt = alt_char
        new_variant = Variant(self._reference.chrom, var_pos, var_ref,
                              var_alt)
        current_variant = self.__add_variant_to_set(
            current_variant, new_variant, variants)
    # Final call with no new variant, for whatever is still pending.
    self.__add_variant_to_set(current_variant, None, variants)
    variants = self.__remove_deletions_from_edges(variants)
    return variants
def set_genotype_likelihoods(self, sample_name, likelihood_values):
    """Store genotype likelihoods for ``sample_name`` in scaled form.

    The keys of ``LIKELIHOOD_SCALING_FACTOR`` are tried in order. For the
    first one that ``has_genotype_key`` reports, the values are multiplied
    by that key's scaling factor and stored through ``add_sample_data``.

    Args:
        sample_name: sample to store the values for.
        likelihood_values: ``None``, ``'.'`` (both stored as they are) or an
            iterable of values; ``None`` / ``'.'`` entries become ``None``.

    Raises:
        weCallException: if none of the likelihood keys is present.
    """
    def scale(values, factor):
        if values is None or values == '.':
            return values
        result = []
        for value in values:
            result.append(None if value in {None, '.'} else value * factor)
        return result

    for key, factor in LIKELIHOOD_SCALING_FACTOR.items():
        if not self.has_genotype_key(key):
            continue
        scaled_values = scale(likelihood_values, factor)
        self.add_sample_data(sample_name, key, scaled_values)
        return

    expected_keys = list(LIKELIHOOD_SCALING_FACTOR.keys())
    raise weCallException(
        "Expected one of {} as the likelihood key.".format(expected_keys))
def __get_expected_calls_from_sample_ascii_haplotypes(
        ascii_haplotypes, reference):
    """Build the expected genotype calls per variant and per sample.

    Args:
        ascii_haplotypes: mapping of sample name to that sample's ascii
            haplotype strings.
        reference: reference the haplotypes are compared against.

    Returns:
        dict mapping each variant to an ``OrderedDict`` of
        ``sample_name -> genotype``.

    Raises:
        weCallException: if a genotype is already recorded for the same
            sample and variant.
    """
    calls_per_variant = {}
    for sample_name, ascii_strings in ascii_haplotypes.items():
        sample_calls = AsciiWecallRunnerTest.__get_expected_calls_from_haplotypes(
            ascii_strings, reference)
        for variant, genotype in sample_calls.items():
            per_sample = calls_per_variant.get(variant)
            if per_sample is not None and sample_name in per_sample:
                raise weCallException(
                    "Cannot supply multiple genotypes for "
                    "sample_name {} and variant {}.".format(
                        sample_name, variant))
            if per_sample is None:
                # ordered dict only to comply with what the actual calls
                # look like
                per_sample = OrderedDict()
                calls_per_variant[variant] = per_sample
            per_sample[sample_name] = genotype
    return calls_per_variant
def build_annotated_seq(self, n_fwd, n_rev, mapping_quality, insert_size,
                        read_id, read_flags, cigar_string, read_start,
                        read_mate_start):
    """Turn this raw sequence into a one-element list of annotated reads.

    A ``ReadSequence`` is built from the stored reference, sequence and
    quality strings (``,`` is replaced by ``.`` and the sequence is
    upper-cased first) and wrapped in a ``ReadSequenceWithCoverage``.

    Coverage is ``(n_fwd, n_rev)`` when ``n_fwd`` is given; otherwise it is
    ``(0, 1)`` for a reverse sequence and ``(1, 0)`` for a forward one.

    Returns:
        A list containing the single ``ReadSequenceWithCoverage``.

    Raises:
        weCallException: if ``n_fwd`` is ``None`` and the sequence is
            neither forward nor reverse.
    """
    reference = ReferenceChromosome(self.reference_string, self.pos_from)
    normalised_seq = self.sequence_string.replace(",", ".").upper()
    sequence = Sequence(reference, normalised_seq, cigar_string)
    quality = SequenceQuality(self.quality_string)
    read_sequence = ReadSequence(sequence, quality, mapping_quality,
                                 insert_size, read_id, read_flags,
                                 read_start, read_mate_start)

    if n_fwd is not None:
        coverage = (n_fwd, n_rev)
    elif self.is_reverse_seq():
        coverage = (0, 1)
    elif self.is_forward_seq():
        coverage = (1, 0)
    else:
        raise weCallException(
            "Raw sequence: {} is neither forward or reverse".format(self))

    forward_count, reverse_count = coverage
    return [ReadSequenceWithCoverage(read_sequence, forward_count,
                                     reverse_count)]
def _parse_flag(value):
    """Parse the value of a 'flag' info field.

    A flag used as a proper flag carries the value ``None``, which is taken
    to mean ``True``. A missing flag (``UNKNOWN``) is ambiguous and is
    therefore returned as ``None`` instead of being parsed.

    Returns:
        ``None`` for ``UNKNOWN``; ``True`` for ``None``; the value itself
        for a ``bool``; otherwise ``True`` for ``1`` / ``YES`` / ``TRUE``
        and ``False`` for ``0`` / ``NO`` / ``FALSE`` (case-insensitive).

    Raises:
        weCallException: for any other value, after logging a warning.
    """
    if value == UNKNOWN:
        return None
    if value is None:
        return True
    if isinstance(value, bool):
        return value

    value = value.upper()
    if value in {'1', 'YES', 'TRUE'}:
        return True
    if value in {'0', 'NO', 'FALSE'}:
        return False

    # For strict VCF parsing configure parser to throw on log warnings.
    # TODO: Work out how to configure logger to do this.
    message = "Invalid flag {}".format(value)
    logging.warning(message)
    raise weCallException(message)
def __validate_ref_seq(self, ref_seq):
    """Check that ``ref_seq`` uses only accepted reference characters.

    Args:
        ref_seq: reference sequence string; may be empty.

    Raises:
        weCallException: if ``ref_seq`` contains a character other than the
            upper-case codes listed in the pattern or ``*``.
    """
    legal = re.match(r'^[ACGTURYKMSWBDHVN\*]*\Z', ref_seq)
    if legal is None:
        message = "Illegal character in reference sequence {!r}".format(
            ref_seq)
        raise weCallException(message)
def sequence_builder(
    reference,
    seq_string,
    quality_string=None,
    n_fwd=None,
    n_rev=None,
    mapping_quality=HIGH_QUALITY,
    insert_size=None,
    read_id=None,
    read_flags=None,
    cigar_string=None,
    read_start=None,
    read_mate_start=None,
):
    """Build annotated read sequences from an ascii sequence string.

    The sequence string is cut into raw sequences at gap positions. If the
    raw sequences form an even-length run that alternates forward, reverse,
    forward, reverse, ... they are treated as read pairs and built with
    ``build_annotated_pair``; otherwise each raw sequence is built on its
    own with ``build_annotated_seq``.

    Args:
        reference: reference providing ``length_with_deletions()``,
            ``pos_from`` and ``ref_seq``.
        seq_string: ascii sequence, as long as the reference (including
            deletions).
        quality_string: ascii qualities of the same length; defaults to a
            string of spaces.
        n_fwd, n_rev: forward/reverse counts; give both or neither.
        mapping_quality, insert_size, read_id, read_flags, cigar_string,
        read_start, read_mate_start: passed through to the builders.

    Returns:
        List of the annotated sequences produced by the builders.

    Raises:
        weCallException: if only one of ``n_fwd`` / ``n_rev`` is given, or
            if the sequence or quality string has the wrong length.
    """
    if quality_string is None:
        quality_string = " " * len(seq_string)

    # Exactly one of the two counts being None is not allowed.
    if (n_fwd is None) != (n_rev is None):
        raise weCallException(
            "Invalid combination of forward and reverse reads: n_fwd = {}, n_rev = {} "
            .format(n_fwd, n_rev))

    if len(seq_string) != reference.length_with_deletions():
        raise weCallException(
            "Sequence has to be of the same length as reference. seq_length {}, ref_length {}"
            .format(len(seq_string), reference.length_with_deletions()))

    if len(quality_string) != reference.length_with_deletions():
        raise weCallException(
            "Quality string has to be of the same length as reference.")

    # Split the input into raw sequences, cutting at gap positions.
    ref_pos = reference.pos_from
    pending = RawStringSequences()
    sequences = []
    columns = zip(reference.ref_seq, seq_string, quality_string)
    for ref_char, seq_char, qual_char in columns:
        position = SequencePosition(ref_char, seq_char, qual_char)
        if position.is_gap:
            if pending.is_ongoing:
                sequences.append(pending)
                pending = RawStringSequences()
        else:
            pending.add_position(position, ref_pos)
        ref_pos = position.update_ref_pos(ref_pos)

    if pending.is_ongoing:
        sequences.append(pending)

    even_indexed = sequences[0::2]
    odd_indexed = sequences[1::2]
    looks_paired = (
        len(sequences) % 2 == 0
        and all(seq.is_forward_seq() for seq in even_indexed)
        and all(seq.is_reverse_seq() for seq in odd_indexed)
    )

    shared_args = (n_fwd, n_rev, mapping_quality, insert_size, read_id,
                   read_flags, cigar_string, read_start, read_mate_start)

    annotated_seqs = []
    if looks_paired:
        # sequence of read pairs
        for fwd, rev in zip(even_indexed, odd_indexed):
            annotated_seqs.extend(build_annotated_pair(fwd, rev, *shared_args))
    else:
        # unpaired reads
        for seq in sequences:
            annotated_seqs.extend(seq.build_annotated_seq(*shared_args))
    return annotated_seqs
def get_read_depth(self, sample_name):
    """Return the read depth field of ``sample_name``.

    The entries of ``READ_DEPTH_KEYS`` are tried in order; the field of the
    first one that ``has_genotype_key`` reports is returned.

    Raises:
        weCallException: if none of the depth keys is present.
    """
    for depth_key in READ_DEPTH_KEYS:
        if not self.has_genotype_key(depth_key):
            continue
        return self.get_field(sample_name, depth_key)

    message = "Expected one of {} as the depth key.".format(READ_DEPTH_KEYS)
    raise weCallException(message)
def sequence_string(self):
    """Return the stored sequence string.

    Raises:
        weCallException: if the sequence is neither a forward nor a reverse
            sequence.
    """
    if not (self.is_forward_seq() or self.is_reverse_seq()):
        message = "Illegal character in sequence {!r}".format(
            self.__sequence_string)
        raise weCallException(message)
    return self.__sequence_string
def get_chromosome_index(chrom):
    """Return the entry of ``CHROMOSOME_ORDER`` for a chromosome name.

    The name is passed through ``standardise_chromosome`` before the
    lookup.

    Raises:
        weCallException: if standardising or looking up the name raises a
            ``KeyError``.
    """
    try:
        standard_name = standardise_chromosome(chrom)
        position = CHROMOSOME_ORDER[standard_name]
    except KeyError:
        raise weCallException("Invalid chromosome {}".format(chrom))
    return position
def from_vcf_str(vcf_str, desired_type):
    """Convert a VCF field string to ``desired_type``.

    Args:
        vcf_str: the raw string; ``"."`` stands for a missing value.
        desired_type: callable used for the conversion, e.g. ``int``.

    Returns:
        ``None`` when ``vcf_str`` is ``"."``, otherwise
        ``desired_type(vcf_str)``.

    Raises:
        weCallException: if the conversion raises ``ValueError``.
    """
    try:
        if vcf_str != ".":
            return desired_type(vcf_str)
        return None
    except ValueError:
        message = "Cannot cast {} to {!r}".format(vcf_str, desired_type)
        raise weCallException(message)
def __validate_expected_calls(expected_ascii, expected_stubs):
    """Ensure that at least one form of expected variants was supplied.

    Args:
        expected_ascii: expected variants in ascii format, or ``None``.
        expected_stubs: expected variants as variant stubs, or ``None``.

    Raises:
        weCallException: if both arguments are ``None``.
    """
    nothing_supplied = expected_ascii is None and expected_stubs is None
    if nothing_supplied:
        raise weCallException(
            "Expected variants have to be provided either in the ascii or variant stub format."
        )