Ejemplo n.º 1
 def testSampleNameSelectorWithMaf(self):
     """Check the sample name reported for every mutation of a tiny maflite file and the output annotation name."""
     # Named ``creator`` rather than ``input`` so the builtin ``input`` is not shadowed.
     creator = MafliteInputMutationCreator("testdata/maflite/tiny_maflite.maf.txt")
     # The selector is built from the first mutation; createMutations() is called again for the loop below.
     first_mut = next(creator.createMutations())
     selector = SampleNameSelector(first_mut)
     for mut in creator.createMutations():
         self.assertEqual("Patient0-Normal-Patient0-Tumor", selector.getSampleName(mut))
     self.assertEqual(selector.getOutputAnnotationName(), MutUtils.SAMPLE_NAME_ANNOTATION_NAME)
Ejemplo n.º 2
 def testSampleNameSelectorWithVCF(self):
     """Check sample names, annotation source and output annotation name for a one-row VCF."""
     # Named ``creator`` rather than ``input`` so the builtin ``input`` is not shadowed.
     creator = VcfInputMutationCreator("testdata/vcf/example.1row.vcf")
     # The selector is built from the first mutation; createMutations() is called again for the loop below.
     first_mut = next(creator.createMutations())
     selector = SampleNameSelector(first_mut)
     expected = ["NA 00001", "NA 00002", "NA 00003"]
     for mut in creator.createMutations():
         self.assertIn(selector.getSampleName(mut), expected)
     self.assertEqual(selector.getAnnotationSource(), "INPUT")
     # assertEqual rather than the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(selector.getOutputAnnotationName(), "sample_name")
Ejemplo n.º 3
 def testSampleNameSelectorWithVCF(self):
     """Check sample names, annotation source and output annotation name for a one-row VCF."""
     # Named ``creator`` rather than ``input`` so the builtin ``input`` is not shadowed.
     creator = VcfInputMutationCreator("testdata/vcf/example.1row.vcf")
     # The selector is built from the first mutation; createMutations() is called again for the loop below.
     first_mut = next(creator.createMutations())
     selector = SampleNameSelector(first_mut)
     expected = ["NA 00001", "NA 00002", "NA 00003"]
     for mut in creator.createMutations():
         self.assertIn(selector.getSampleName(mut), expected)
     self.assertEqual(selector.getAnnotationSource(), "INPUT")
     # assertEqual rather than the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(selector.getOutputAnnotationName(), "sample_name")
Ejemplo n.º 4
 def testSampleNameSelectorWithMaf(self):
     # Builds a SampleNameSelector from the first mutation of a maflite input and checks that its annotation
     # source is "OUTPUT".
     # NOTE(review): this listing looks truncated -- the MafliteInputMutationCreator( call is never closed (its
     # argument is missing) and the for-loop has no body, so the test does not parse as shown. Restore the
     # missing lines before use.
     input = MafliteInputMutationCreator(
     first_mut = next(input.createMutations())
     s = SampleNameSelector(first_mut)
     for mut in input.createMutations():
     self.assertEqual(s.getAnnotationSource(), "OUTPUT")
Ejemplo n.º 5
    def _writeMuts2Tsv(self, muts, path):
        # NOTE(review): this listing looks truncated. The description below has lost its docstring delimiters, the
        # calls started on the `sampleNameSelector = ...` and `writer = csv.DictWriter(...` lines are never closed,
        # and `if fieldname.startswith("_"):` has no body, so the method does not parse as shown. Restore the
        # missing lines from the upstream source before relying on it.
        # NOTE(review): `path` is passed as dir= to NamedTemporaryFile below, i.e. it is used as a directory even
        # though the parameter description calls it a filename.
        Given a mutation generator, this methods writes a tab separated file for all mutations in the mutation
        generator. In addition, this method computes the appropriate sample name in scenarios where the mutation is
        missing sample name annotation. It also computes a list of all chromosomes and sample names contained within
        the generator.

        :param path: temporary filename
        :param muts: generator object with mutations

        # NOTE(review): nothing in the visible code adds to sampleNames or chroms, so both come back empty as
        # shown -- presumably the code that fills them is among the missing lines; confirm.
        sampleNames = set()
        chroms = set()

        writer = None

        # create a temporary file to write tab-separated file
        # delete=False: the file is not removed on close; its name is returned to the caller at the end. The handle
        # itself is not closed in the visible code -- the file is reopened by name just below.
        tempTsvFile = tempfile.NamedTemporaryFile(dir=path, delete=False)
        self.logger.debug("Creating intermediate tsv file at %s" % tempTsvFile.name)

        mutAttributeNames = []
        sampleNameSelector = SampleNameSelector(self.mutation,

        with open(tempTsvFile.name, 'w') as fptr:
            ctr = 0
            sampleNameAnnotationName = sampleNameSelector.getOutputAnnotationName()
            sampleNameSource = sampleNameSelector.getAnnotationSource()

            for mut in muts:
                # remember the attribute names of the first mutation seen
                if len(mutAttributeNames) == 0:
                    mutAttributeNames = mut.getAttributeNames()

                # only create the sample name annotation when the mutation does not already carry one
                sampleName = sampleNameSelector.getSampleName(mut)
                if sampleName is not None:
                    if mut.get(sampleNameAnnotationName, None) is None:
                        mut.createAnnotation(sampleNameAnnotationName, sampleName, sampleNameSource)

                # Parse chromosome

                # overwrite start/ref/alt on the mutation with the values returned for rendering
                updated_start, updated_ref_allele, updated_alt_allele = MutUtils.retrieveMutCoordinatesForRendering(mut)
                mut.ref_allele = updated_ref_allele
                mut.alt_allele = updated_alt_allele
                mut.start = updated_start

                # the field names (and the writer) are set up once, from the first mutation
                if ctr == 0:
                    fieldnames2Render = MutUtils.getAllAttributeNames(mut)
                    if sampleNameAnnotationName is not None:
                        fieldnames2Render += [sampleNameAnnotationName]
                    for fieldname in fieldnames2Render:  # fieldnames that start "_" aren't rendered
                        if fieldname.startswith("_"):

                    writer = csv.DictWriter(fptr, fieldnames2Render, extrasaction='ignore', delimiter=self.delimiter,


                # progress message every 1000 mutations
                ctr += 1
                if (ctr % 1000) == 0:
                    self.logger.info("Wrote " + str(ctr) + " mutations to tsv.")

        sampleNames = list(sampleNames)
        chroms = list(chroms)

        return chroms, sampleNames, tempTsvFile.name
Ejemplo n.º 6
class OnpQueue(object):
    """Bookkeeping class to maintain the mutations waiting to be combined"""
    def __init__(self, mutations):
        """
        Initialize a new queue with a MutationData iterator

        :param mutations: any MutationData producing Iterator
        """
        # wrapped in peekable so the first mutation can be looked at without consuming it
        self.mutations = more_itertools.peekable(mutations)
        # NOTE(review): peek() is called without a default; presumably it raises on an empty iterator -- confirm
        self.sns = SampleNameSelector(self.mutations.peek())
        self.queue = collections.defaultdict(list)  # iterated as (sample, mutations) pairs by _dump_all
        self.indel_queue = []  # appended to the results by _dump_all
        self.last = 0
        self.logger = logging.getLogger(__name__)
        self.warned_about_order = False  # guards the sort-order message in get_combined_mutations

    def _create_start_position_dict(mutations):
        Create a start_position -> mutation dict
        :param mutations: a collection of MutationData
        :return: a dictionary containing all the input MutationData grouped by start postion
        assert (mutations is not None)
        starts = collections.defaultdict(list)
        for mut in mutations:
            starts[int(mut.start)] += [mut]
        return starts

    # NOTE(review): this method looks truncated in this listing. The docstring opened below is never closed, and the
    # statement(s) that should follow `if muts == [] or start not in muts:` are missing -- the deeper indentation of
    # the loop suggests an `else:` branch was lost. The absence of `self` and the `OnpQueue._paths(...)` call style
    # suggest a lost @staticmethod decorator as well. Restore from the upstream source before use.
    def _paths(finished_paths, path_so_far, start, muts):
        """Return all paths from the start position through the mutation graph
        :param finished_paths: completed paths
        :param path_so_far: the accumulated mutation->mutation path so far
        :param start: the start position to travers the muts from
        :param muts: a dictionary in the form {start_position: [Mutation]}
        :return: All paths through adjacent mutations starting with mutations at chromosome position start
        if muts == [] or start not in muts:
            # return reduce(operator.concat, lambda mut: OnpCombiner._paths(path + [mut], mut.end+1, muts), [])
            # path =  map(lambda mut: OnpQueue._paths(path + [mut], int(mut.end)+1, muts), muts[start])
            for mut in muts[start]:
                OnpQueue._paths(finished_paths, path_so_far + [mut],
                                int(mut.end) + 1, muts)
            return finished_paths
            #return reduce(operator.concat, path)

    def _add(self, mutation):
        # NOTE(review): truncated in this listing -- the `if` below has no body and nothing visible stores the
        # mutation anywhere, so the method does not parse as shown. Restore from the upstream source before use.
        # The variant type is inferred from the ref and alt alleles of the mutation.
        variant_type = TranscriptProviderUtils.infer_variant_type(
            mutation.ref_allele, mutation.alt_allele)
        # only combine ONPs, not indels
        if not TranscriptProviderUtils.is_xnp(variant_type):

    def _walk_mutation_paths(self, muts):
        """
        Find all paths through adjacent mutations and return those as combined mutations

        Find the first mutations by chromosome position and compute all paths through adjacent mutations reachable
        from them. If there are any nodes that were not reached, choose the first position with unreached nodes and
        repeat.

        :param muts: a list of mutations to walk through
        :return: a list with one combined mutation per path found
        """
        unreached = muts
        paths = []
        starts = self._create_start_position_dict(muts)
        while unreached:
            # restart the walk from the lowest start position that has not been reached yet
            first_start = min(int(mut.start) for mut in unreached)
            # _paths takes the {start_position: [mutation]} dict as its fourth argument
            paths += self._paths([], [], first_start, starts)
            reached = [mut for path in paths for mut in path]
            unreached = [mut for mut in muts if mut not in reached]

        return [OnpQueue._combine_mutations(path) for path in paths]

    def _dump_all(self):
        results = []
        for (sample, muts) in self.queue.iteritems():
            results += self._walk_mutation_paths(muts)

        #add all stored up indels
        results += self.indel_queue or []
        self.indel_queue = []
        results.sort(key=lambda x: (int(x.start), int(x.end)))

        return results

    def _get_all_values(self):
        return [j for i in self.queue.values() for j in i]

    def _is_adjacent_to_any_xnp(self, new_mutation):
        return self._is_adjacent(new_mutation, self._get_all_values())

    def _is_adjacent(self, new_mutation, mutations):
        if mutations:
            ends = [int(x.end) for x in mutations]
            return int(new_mutation.start) <= 1 + max(ends)
            return False

    # NOTE(review): this method looks truncated in this listing. The description below has lost its docstring
    # delimiters, the `newmut = MutationData(chr=chr,` and `annotationNames = allAnnotations - set(` calls are never
    # closed, `except KeyError:` has neither its `try:` nor a handler body, and nothing visible fills `annotations`
    # or attaches the computed value/tags/source/datatype/number/description to `newmut`. The absence of `self` and
    # the `OnpQueue._combine_mutations(path)` call style suggest a lost @staticmethod decorator. Restore from the
    # upstream source before use.
    def _combine_mutations(mutations):
        Merge multiple adjacent mutations into a single new mutation.

        :param mutations: an ordered list of MutationData
        :returns a new MutationData

        :warning: _combine_mutations does not make any attempt to sanity check input mutations
        it will happily combine overlapping and non-adjacent mutations on disparate chromosomes
        # nothing to combine: an empty list gives None, a single mutation is returned unchanged
        if len(mutations) == 0:
            return None
        if len(mutations) == 1:
            return mutations[0]

        # special logic for the attributes
        # NOTE(review): start/end are compared unconverted here while other methods wrap them in int(); if they are
        # strings, min/max compare lexicographically -- confirm the type.
        start = min([mut.start for mut in mutations])
        end = max([mut.end for mut in mutations])
        chr = mutations[0].chr
        # alleles are concatenated in list order
        ref = "".join([mut.ref_allele for mut in mutations])
        alt = "".join([mut.alt_allele for mut in mutations])
        # distinct builds are pipe-joined; set iteration order is not guaranteed, so the order may vary
        build = "|".join(set([x.build for x in mutations]))

        #create the new mutation
        newmut = MutationData(chr=chr,

        #add annotations to the mutation
        allAnnotations = set(flatmap(lambda x: x.keys(), mutations))
        annotationNames = allAnnotations - set(
        for annotName in annotationNames:
            annotations = []
            for mut in mutations:
                except KeyError:

            # distinct truthy values, sorted and pipe-joined
            values = sorted(
                (set([x.getValue() for x in annotations if x.getValue()])))
            value = "|".join(values)
            tags = sorted(set(flatmap(lambda x: x.getTags(), annotations)))
            # the remaining metadata is taken from the first annotation collected
            source = annotations[0].getDatasource()
            datatype = annotations[0].getDataType()
            number = annotations[0].getNumber()
            description = annotations[0].getDescription()
        return newmut

    def _combine_with_indels(self, output):
        """add the indels to output and sort by start position"""
        output += self.indel_queue or []
        self.indel_queue = []
        output.sort(key=lambda x: int(x.start))

    def get_combined_mutations(self):
        # NOTE(review): this method looks truncated in this listing: the `:return:` line below has lost its docstring
        # delimiters, `elif mut.start == last_start:` has no body, the three message strings have lost the call that
        # takes them (presumably a logger warning), the last `output = self._dump_all()` inside the loop has lost
        # its `else:`, and no statement that queues `mut` (see the "add the mutation" comments) is visible.
        # Restore from the upstream source before use.
        :return: a generator yielding mutations, adjacent SNPs,DNPs, and ONPs will be merged together.
        # assumes mutations are sorted by start position and then sample
        #if they're not, it won't find DNPs
        last_chr = -1
        last_start = -1
        for mut in self.mutations:
            output = []
            #if we're on a new chromosome, dump all mutations, then add the new one to the queue
            if mut.chr != last_chr:
                output = self._dump_all()
            #if we're at the same start position, add the new mutation to the queue
            elif mut.start == last_start:
            #if we are at a new position on the same chromosome
            elif self._is_adjacent_to_any_xnp(mut):
                #  if we are adjacent/overlapping to one of our existing positions
                #   add the mutation
                # NOTE(review): last_start holds mut.start unconverted while this test converts with int(); if start
                # is a string the comparison mixes types -- confirm.
                if not self.warned_about_order and int(mut.start) < last_start:
                        # NOTE(review): the adjacent literals "...that you" "using with..." join without a space
                        "Mutations are not sorted by start position, this may cause unexpected behavior or "
                        "increased memory requirements.  It is recommended that your sort any files that you"
                        "using with --infer-onps by position and sample name.")
                    self.warned_about_order = True
            #  if we are not adjacent to any existing queue position,
            #   dump mutations, then add the mutation
                output = self._dump_all()
            last_chr = mut.chr
            last_start = mut.start

            # hand out whatever was flushed for this input mutation (the loop variable name `mut` is reused)
            for mut in output:
                yield mut

        #when we're finished, be sure to dump any last mutations
        output = self._dump_all()
        for mut in output:
            yield mut
Ejemplo n.º 7
class OnpQueue(object):
    """Bookkeeping class to maintain the mutations waiting to be combined"""

    def __init__(self, mutations, mutation_data_factory):
        """
        Initialize a new queue with a MutationData iterator

        :param mutations: any MutationData producing Iterator
        :param mutation_data_factory: a MutationDataFactory to be used to produce new mutations for the ONPs
        """
        # wrapped in peekable so the first mutation can be looked at without consuming it
        self.mutations = more_itertools.peekable(mutations)
        # NOTE(review): peek() is called without a default; presumably it raises on an empty iterator -- confirm
        self.sns = SampleNameSelector(self.mutations.peek())
        self.queue = collections.defaultdict(list)  # iterated as (sample, mutations) pairs by _dump_all
        self.indel_queue = []  # appended to the results by _dump_all
        self.last = 0
        self.logger = logging.getLogger(__name__)
        self.warned_about_order = False  # guards the sort-order message in get_combined_mutations
        self._mutation_data_factory = mutation_data_factory  # handed to _combine_mutations

    def _create_start_position_dict(mutations):
        Create a start_position -> mutation dict
        :param mutations: a collection of MutationData
        :return: a dictionary containing all the input MutationData grouped by start postion
        assert mutations is not None
        starts = collections.defaultdict(list)
        for mut in mutations:
            starts[int(mut.start)] += [mut]
        return starts

    # NOTE(review): this method looks truncated in this listing. The docstring opened below is never closed, the
    # statement(s) that should follow `if muts == [] or start not in muts:` are missing, and the phase check inside
    # the loop has lost whatever separated its "stop this path" case from the recursive call (the indentation
    # suggests lost `else:` lines). The absence of `self` and the `OnpQueue._paths(...)` call style suggest a lost
    # @staticmethod decorator as well. Restore from the upstream source before use.
    def _paths(finished_paths, path_so_far, start, muts):
        """Return all paths from the start position through the mutation graph
        :param finished_paths: completed paths
        :param path_so_far: the accumulated mutation->mutation path so far
        :param start: the start position to travers the muts from
        :param muts: a dictionary in the form {start_position: [Mutation]}
        :return: All paths through adjacent mutations starting with mutations at chromosome position start
        if muts == [] or start not in muts:

            # No mutations available to continue this chain
            # return reduce(operator.concat, lambda mut: OnpCombiner._paths(path + [mut], mut.end+1, muts), [])
            # path =  map(lambda mut: OnpQueue._paths(path + [mut], int(mut.end)+1, muts), muts[start])
            for mut in muts[start]:
                if len(path_so_far) > 0 and not PhasingUtils.is_in_phase(path_so_far[-1], mut):

                    # Next mutation not in phase, so stop this path here.
                    OnpQueue._paths(finished_paths, path_so_far + [mut], int(mut.end) + 1, muts)
            return finished_paths
            # return reduce(operator.concat, path)

    def _add(self, mutation):
        # NOTE(review): truncated in this listing -- the `if` below has no body and nothing visible stores the
        # mutation anywhere, so the method does not parse as shown. Restore from the upstream source before use.
        # The variant type is inferred from the ref and alt alleles of the mutation.
        variant_type = TranscriptProviderUtils.infer_variant_type(mutation.ref_allele, mutation.alt_allele)
        # only combine ONPs, not indels
        if not TranscriptProviderUtils.is_xnp(variant_type):

    def _walk_mutation_paths(self, muts):
        """
        Find all paths through adjacent mutations and return those as combined mutations

        Find the first mutations by chromosome position and compute all paths through adjacent mutations reachable
        from them. If there are any nodes that were not reached, choose the first position with unreached nodes and
        repeat.

        :param muts: a list of mutations to walk through
        :return: a list with one combined mutation per path found
        """
        unreached = muts
        paths = []
        starts = self._create_start_position_dict(muts)
        while unreached:
            # restart the walk from the lowest start position that has not been reached yet
            first_start = min(int(mut.start) for mut in unreached)
            paths += self._paths([], [], first_start, starts)
            reached = [mut for path in paths for mut in path]
            unreached = [mut for mut in muts if mut not in reached]

        # the factory is passed along so combined mutations are created through it
        return [OnpQueue._combine_mutations(path, self._mutation_data_factory) for path in paths]

    def _dump_all(self):
        results = []
        for (sample, muts) in self.queue.iteritems():
            results += self._walk_mutation_paths(muts)

        # add all stored up indels
        results += self.indel_queue or []
        self.indel_queue = []
        results.sort(key=lambda x: (int(x.start), int(x.end)))

        return results

    def _get_all_values(self):
        return [j for i in self.queue.values() for j in i]

    def _is_adjacent_to_any_xnp(self, new_mutation):
        return self._is_adjacent(new_mutation, self._get_all_values())

    def _is_adjacent(self, new_mutation, mutations):
        if mutations:
            ends = [int(x.end) for x in mutations]
            return int(new_mutation.start) <= 1 + max(ends)
            return False

    # NOTE(review): this method looks truncated in this listing. The description below has lost its docstring
    # delimiters, the `mutation_data_factory.create(` call is never closed, `except KeyError:` has neither its `try:`
    # nor a handler body, the two consecutive `value = ...` assignments have presumably lost the `else:` between
    # them, and nothing visible fills `annotations` or attaches the computed
    # value/tags/source/datatype/number/description to `newmut`. The absence of `self` and the
    # `OnpQueue._combine_mutations(...)` call style suggest a lost @staticmethod decorator. Restore from the upstream
    # source before use.
    def _combine_mutations(mutations, mutation_data_factory):
        Merge multiple adjacent mutations into a single new mutation.

        :param mutations: an ordered list of MutationData
        :returns a new MutationData

        :warning: _combine_mutations does not make any attempt to sanity check input mutations
        it will happily combine overlapping and non-adjacent mutations on disparate chromosomes
        # nothing to combine: an empty list gives None, a single mutation is returned unchanged
        if len(mutations) == 0:
            return None
        if len(mutations) == 1:
            return mutations[0]

        # special logic for the attributes
        # NOTE(review): start/end are compared unconverted here while other methods wrap them in int(); if they are
        # strings, min/max compare lexicographically -- confirm the type.
        start = min([mut.start for mut in mutations])
        end = max([mut.end for mut in mutations])
        chr = mutations[0].chr
        # alleles are concatenated in list order
        ref = "".join([mut.ref_allele for mut in mutations])
        alt = "".join([mut.alt_allele for mut in mutations])
        # distinct builds are pipe-joined; set iteration order is not guaranteed, so the order may vary
        build = "|".join(set([x.build for x in mutations]))

        # create the new mutation
        newmut = mutation_data_factory.create(
            chr=chr, start=start, end=end, ref_allele=ref, alt_allele=alt, build=build

        # add annotations to the mutation
        # names that are also attribute names of the first mutation are excluded
        allAnnotations = set(flatmap(lambda x: x.keys(), mutations))
        annotationNames = allAnnotations - set(mutations[0].getAttributeNames())
        for annotName in annotationNames:
            annotations = []
            for mut in mutations:
                except KeyError:

            values = [x.getValue() for x in annotations]
            if len(set(values)) == 1:
                value = values[0]  # if all annotations are identical then don't pipe separate them
                value = "|".join(values)

            tags = sorted(set(flatmap(lambda x: x.getTags(), annotations)))
            # the remaining metadata is taken from the first annotation collected
            source = annotations[0].getDatasource()
            datatype = annotations[0].getDataType()
            number = annotations[0].getNumber()
            description = annotations[0].getDescription()
        return newmut

    def _combine_with_indels(self, output):
        """add the indels to output and sort by start position"""
        output += self.indel_queue or []
        self.indel_queue = []
        output.sort(key=lambda x: int(x.start))

    def get_combined_mutations(self):
        # NOTE(review): this method looks truncated in this listing: the `:return:` line below has lost its docstring
        # delimiters, `elif mut.start == last_start:` has no body, the three message strings have lost the call that
        # takes them (presumably a logger warning) together with its closing parenthesis, the last
        # `output = self._dump_all()` inside the loop has lost its `else:`, and no statement that queues `mut` (see
        # the "add the mutation" comments) is visible. Restore from the upstream source before use.
        :return: a generator yielding mutations, adjacent SNPs,DNPs, and ONPs will be merged together.
        # assumes mutations are sorted by start position and then sample
        # if they're not, it won't find DNPs
        last_chr = -1
        last_start = -1
        for mut in self.mutations:
            output = []
            # if we're on a new chromosome, dump all mutations, then add the new one to the queue
            if mut.chr != last_chr:
                output = self._dump_all()
            # if we're at the same start position, add the new mutation to the queue
            elif mut.start == last_start:
            # if we are at a new position on the same chromosome
            elif self._is_adjacent_to_any_xnp(mut):
                #  if we are adjacent/overlapping to one of our existing positions
                #   add the mutation
                # NOTE(review): last_start holds mut.start unconverted while this test converts with int(); if start
                # is a string the comparison mixes types -- confirm.
                if not self.warned_about_order and int(mut.start) < last_start:
                        # NOTE(review): the adjacent literals "...that you" "using with..." join without a space
                        "Mutations are not sorted by start position, this may cause unexpected behavior or "
                        "increased memory requirements.  It is recommended that your sort any files that you"
                        "using with --infer-onps by position and sample name."
                    self.warned_about_order = True
            #  if we are not adjacent to any existing queue position,
            #   dump mutations, then add the mutation
                output = self._dump_all()
            last_chr = mut.chr
            last_start = mut.start

            # hand out whatever was flushed for this input mutation (the loop variable name `mut` is reused)
            for mut in output:
                yield mut

        # when we're finished, be sure to dump any last mutations
        output = self._dump_all()
        for mut in output:
            yield mut