def __init__(self, nodes, constant_modifications=None, variable_modifications=None):
    '''
    Set up the node heap, the edge table and the heap of building blocks.

    Parameters
    ----------
    nodes : iterable
        Objects exposing a ``mass`` attribute. Each mass is wrapped in a
        ``Node`` that points back at this graph.
    constant_modifications : list, optional
        Passed to ``generate_component_set`` as its first argument.
        ``None`` (or any falsy value) is treated as an empty list.
    variable_modifications : iterable, optional
        Passed to ``generate_component_set`` as its second argument after
        the ``HexNAc`` modification rule has been appended to a private copy.
    '''
    self.nodes = MassHeap(Node(s.mass, graph=self) for s in nodes)
    self.edges = defaultdict(list)
    self.constant_modifications = constant_modifications or []
    # Work on a copy: appending to the list object handed in by the caller
    # would grow the caller's list by one HexNAc rule per construction.
    self.variable_modifications = list(variable_modifications or [])
    self.variable_modifications.append(modification.Modification("HexNAc").rule)
    self.parts = MassHeap(generate_component_set(
        self.constant_modifications,
        self.variable_modifications))
    # Cache of building-block heaps keyed by how many parts are combined.
    self.long_parts = {1: self.parts}
    self.node_map = {}
def generate_random_glycopeptides(target_mass, ppm_error=10e-6, count=20, constant_modifications=None,
                                  variable_modifications=None, glycans=None, min_length=0, cleavage_start=None,
                                  cleavage_end=None, max_missed_cleavages=1, max_glycosylations=2):
    '''
    Given a target mass value and a tolerance threshold around it, create a set of random glycopeptides
    that satisfy the mass requirements.

    Parameters
    ----------
    target_mass : float
        Mass the generated sequences must hit. It is the denominator of the
        relative error, so it must be non-zero.
    ppm_error : float
        Largest accepted value of ``|target_mass - mass| / target_mass``.
    count : int
        Number of distinct solutions to collect. The search gives up after
        ``count * 10000`` iterations even if fewer were found.
    constant_modifications, variable_modifications : list, optional
        Modification rules handed to ``generate_component_set``. They are
        deep-copied and every entry whose ``name`` is ``"HexNAc"`` is dropped.
    glycans : iterable, optional
        Glycans attached to the sequons; defaults to ``mammalian_glycans``.
        It is iterated once per sequon, so it must be re-iterable.
    min_length : int
        Padded sequences shorter than this are not accepted. Glycosylation is
        only attempted once the candidate is longer than ``min_length / 3``.
    cleavage_start, cleavage_end : list, optional
        Arguments for ``Protease``; ``None`` or empty becomes ``[""]``.
    max_missed_cleavages : int
        A candidate exceeding this many missed cleavages is discarded.
    max_glycosylations : int
        Sequons are only offered while ``has_glycan(candidate)`` is below this.

    Returns
    -------
    set of str
        String forms of the accepted padded sequences (possibly fewer than
        ``count``, possibly empty).
    '''
    if glycans is None:
        glycans = mammalian_glycans
    if constant_modifications is None:
        constant_modifications = []
    else:
        constant_modifications = copy.deepcopy(constant_modifications)
    if variable_modifications is None:
        variable_modifications = []
    else:
        variable_modifications = copy.deepcopy(variable_modifications)

    if not cleavage_start:
        cleavage_start = [""]
    if not cleavage_end:
        cleavage_end = [""]
    cleavage_pattern = Protease(cleavage_start, cleavage_end)

    # HexNAc is filtered out of the plain components; glycans are only
    # introduced through the sequon building blocks below.
    variable_modifications = [
        mod for mod in variable_modifications if mod.name != "HexNAc"]
    constant_modifications = [
        mod for mod in constant_modifications if mod.name != "HexNAc"]

    components = MassHeap(
        GrowingSequence(component, cleavage_pattern)
        for component in generate_component_set(constant_modifications, variable_modifications))
    # One building block per (sequon, glycan) pair, with the glycan written
    # in parentheses right after the sequon's first character.
    sequons = MassHeap(
        GrowingSequence(
            "{0}({1}){2}".format(sequon[0], glycan.as_modification().serialize(), sequon[1:]),
            cleavage_pattern)
        for sequon in generate_n_linked_sequons()
        for glycan in glycans)

    water = Composition("H2O").mass

    def start_over():
        # Fresh empty candidate plus the mass budget available to it.
        fresh = GrowingSequence("", cleavage_pattern)
        budget = (water + target_mass) - min(p.mass for p in fresh.pad())
        return fresh, budget

    solutions = set()
    max_iter = count * 10000
    iter_count = 0
    candidate, mass_to_meet = start_over()
    untouched = True  # nothing has been added since the last reset
    while len(solutions) < count and iter_count < max_iter:
        iter_count += 1
        # Short-circuit order matters: random.random() is only drawn when the
        # length and glycosylation-count conditions already hold.
        can_glycosylate = (len(candidate) > min_length / 3) and \
            (has_glycan(candidate) < max_glycosylations) and \
            (random.random() > .7)
        options = list(components.get_lower_than(mass_to_meet))
        if can_glycosylate:
            options.extend(sequons.get_lower_than(mass_to_meet))

        if not options:
            # random.choice() raises IndexError on an empty list.
            if untouched:
                # Not even an empty candidate can be extended; a reset would
                # land in this same state, so stop instead of spinning.
                break
            candidate, mass_to_meet = start_over()
            untouched = True
            continue

        next_part = random.choice(options)
        candidate.extend(next_part)
        untouched = False
        mass_to_meet -= (next_part.mass - water)

        # Reset, too many missed cleavages?
        if candidate.missed_cleavages > max_missed_cleavages:
            candidate, mass_to_meet = start_over()
            untouched = True

        # Only consider glycosylated sequences
        if has_glycan(candidate) >= 1:
            for padded_sequence in candidate.pad():
                # Only consider longer sequences
                if len(padded_sequence) < min_length:
                    continue
                error = fabs(
                    (target_mass - padded_sequence.mass) / float(target_mass))
                # Accept?
                if error <= ppm_error:
                    solutions.add(str(padded_sequence))

        # Reset, too big? (components[0] is used as the lightest component)
        if mass_to_meet < components[0].mass:
            candidate, mass_to_meet = start_over()
            untouched = True

    return solutions
class Graph(object):
    '''
    Graph over a set of masses. Nodes wrap the input masses; an edge joins two
    nodes when the gap between their masses matches the mass of a building
    block (within 2e-5 as measured by ``ppm_error``).
    '''

    def __init__(self, nodes, constant_modifications=None, variable_modifications=None):
        '''
        Parameters
        ----------
        nodes : iterable
            Objects exposing a ``mass`` attribute. Each mass is wrapped in a
            ``Node`` that points back at this graph.
        constant_modifications : list, optional
            Passed to ``generate_component_set`` as its first argument.
        variable_modifications : iterable, optional
            Passed to ``generate_component_set`` as its second argument after
            the ``HexNAc`` modification rule has been appended to a private copy.
        '''
        self.nodes = MassHeap(Node(s.mass, graph=self) for s in nodes)
        self.edges = defaultdict(list)
        self.constant_modifications = constant_modifications or []
        # Work on a copy: appending to the list object handed in by the caller
        # would grow the caller's list by one HexNAc rule per construction.
        self.variable_modifications = list(variable_modifications or [])
        self.variable_modifications.append(modification.Modification("HexNAc").rule)
        self.parts = MassHeap(generate_component_set(
            self.constant_modifications,
            self.variable_modifications))
        # Cache of building-block heaps keyed by how many parts are combined.
        self.long_parts = {1: self.parts}
        self.node_map = {}

    def __iter__(self):
        '''Iterate over the graph's nodes.'''
        return iter(self.nodes)

    def process(self, node, parts=None):
        '''
        Record an edge from `node` to every heavier node whose mass gap
        matches one of `parts` (defaults to ``self.parts``).
        '''
        if parts is None:
            parts = self.parts
        for extent in self.nodes.get_higher_than(node.mass):
            gap_mass = extent.mass - node.mass
            for part in parts.get_lower_than(gap_mass + precursor_mass_shift + 1):
                if fabs(ppm_error(gap_mass + y_mass_shift, part.mass + y_mass_shift)) <= 2e-5:
                    # Edges are keyed by the unordered node pair.
                    self.edges[frozenset((node, extent))].append(
                        Edge(node, extent, node.mass, link_sequence=part))

    def process_all(self, length=1):
        '''
        Run :meth:`process` on every node using building blocks made of
        `length` parts, then rebuild ``node_map``. The combined building
        blocks are generated on first use and cached in ``long_parts``.
        '''
        parts = self.long_parts.get(length)
        if parts is None:
            self.build_unordered_sequences(length)
            parts = self.long_parts[length]
        for node in self:
            self.process(node, parts)
        self.build_node_map()

    def build_node_map(self):
        '''
        Rebuild ``node_map`` so each node maps to the edges of every pair in
        which it is the minimum according to ``masser``.
        '''
        self.node_map = defaultdict(list)
        for node in self:
            for pair, pair_edges in self.edges.items():
                if node in pair and node == min(pair, key=masser):
                    self.node_map[node].extend(pair_edges)

    def roots(self):
        '''
        Return the nodes in ``node_map`` that have at least one edge and are
        not the ``to_terminus`` of any edge.
        '''
        roots = set(self.node_map)
        for node, edges in self.node_map.items():
            for edge in edges:
                roots.discard(edge.to_terminus)
            # traverse() leaves empty entries behind for nodes without edges.
            if not edges:
                roots.discard(node)
        return list(roots)

    def build_unordered_sequences(self, n=2):
        '''Build and cache the heap of `n`-part unordered combinations of ``self.parts``.'''
        self.long_parts[n] = MassHeap(list(unordered_combinations(self.parts, n)))

    def get_sequence(self, node):
        '''
        For each path found by :meth:`traverse` from `node`, yield
        ``(node, SequenceCollection of the path's link sequences, last node)``.
        A node without outgoing edges yields one empty collection that ends
        at `node` itself.
        '''
        for path in self.traverse(node):
            # traverse() yields [] for a node with no edges; path[-1] would
            # raise IndexError there, so the empty path ends where it starts.
            terminus = path[-1].to_terminus if path else node
            links = [edge.link_sequence for edge in path]
            yield (node, SequenceCollection(links), terminus)

    def traverse(self, node):
        '''
        Yield every path (list of edges) obtained by following ``node_map``
        from `node` until a node without edges is reached. A node without
        edges yields a single empty path.
        '''
        outgoing = self.node_map[node]
        if len(outgoing) == 0:
            yield []
        else:
            for edge in outgoing:
                for path in self.traverse(edge.to_terminus):
                    yield [edge] + path

    def all_paths(self):
        '''Yield ``(sum of len(edge) over the path, path)`` for every path from every root.'''
        for root in self.roots():
            for path in self.traverse(root):
                yield (sum(map(len, path)), path)

    def all_sequences(self):
        '''Yield ``(len(collection), sequence_tuple)`` for every sequence from every root.'''
        for root in self.roots():
            for seq in self.get_sequence(root):
                yield (len(seq[1]), seq)

    def identify_node(self, node, parts=None):
        '''
        Append to ``node.composition`` / ``node.kind`` every part whose mass
        plus the y shift (checked first) or the b shift matches ``node.mass``,
        and return the node's composition zipped with its kinds. Entries are
        appended to the node's own lists, so they accumulate across calls.
        '''
        if parts is None:
            parts = self.parts
        for part in parts:
            if fabs(ppm_error(node.mass, part.mass + y_mass_shift)) <= 2e-5:
                node.composition.append(part)
                node.kind.append('y')
            elif fabs(ppm_error(node.mass, part.mass + b_mass_shift)) <= 2e-5:
                node.composition.append(part)
                node.kind.append('b')
        return zip(node.composition, node.kind)
def sequence_spectra(ms_spectrum, drop_mass=0, constant_modifications=None, variable_modifications=None,
                     max_running_gaps=1, max_total_gaps=2):
    '''
    Grow candidate sequences one building block per round, keeping those
    whose mass (plus the y or b shift) matches a tandem peak within 2e-5.

    Parameters
    ----------
    ms_spectrum : object
        Must expose ``neutral_mass`` and ``tandem_data``; the tandem entries
        expose ``neutral_mass``.
    drop_mass : float
        Subtracted from the spectrum's neutral mass for the logged precursor mass.
    constant_modifications, variable_modifications : list, optional
        Passed to ``generate_component_set``. The ``HexNAc`` rule is appended
        to a private copy of the variable modifications.
    max_running_gaps, max_total_gaps : int
        A sequence with no matching peak in a round is still extended (by
        every part) only while ``current_gaps + 1 <= max_running_gaps`` and
        ``total_gaps + 1 <= max_total_gaps``. ``max_total_gaps`` also bounds
        how many recent rounds are kept (``4 * max_total_gaps``).

    Returns
    -------
    ResultsGroup
        Built from the retained rounds' matched sequences and the list of
        parts, returned once a round produces no candidates.
    '''
    constant_modifications = constant_modifications or []
    # Copy before appending so the caller's list is not grown on every call.
    variable_modifications = list(variable_modifications or [])
    variable_modifications.append(modification.Modification("HexNAc").rule)
    precursor_mass = ms_spectrum.neutral_mass - drop_mass
    logger.info("Precursor Mass: %f", precursor_mass)
    previous_sequences = SqliteDiskQueue()
    # A real list: parts is iterated many times below, and a Python 3 map()
    # object would be exhausted after the first pass.
    parts = [SimpleFragment(component)
             for component in generate_component_set(constant_modifications,
                                                     variable_modifications)]
    tandem = MassHeap(ms_spectrum.tandem_data)

    # Get starting component
    seeded = False
    for part in parts:
        for msms in tandem:
            if fabs(ppm_error(msms.neutral_mass, part.mass + y_mass_shift)) < 2e-5:
                previous_sequences.append(SequenceRecord(part, kind='y'))
                seeded = True
            if fabs(ppm_error(msms.neutral_mass, part.mass + b_mass_shift)) < 2e-5:
                previous_sequences.append(SequenceRecord(part, kind='b'))
                seeded = True
    if not seeded:
        # No single part matches a peak: start from every part instead.
        # NOTE(review): the two positional 1s are presumably gap counters - confirm.
        for part in parts:
            previous_sequences.append(SequenceRecord(part, 1, 1))
    next_sequences = SqliteDiskQueue()
    # NOTE(review): with max_total_gaps == 0 this deque retains nothing.
    solutions = deque(maxlen=4 * max_total_gaps)
    min_part = min(parts, key=lambda x: x.mass).mass
    max_part = max(parts, key=lambda x: x.mass).mass
    while len(previous_sequences) > 0:
        for seq in previous_sequences:
            matched_peaks = []
            lower = seq.mass + min_part
            upper = seq.mass + max_part
            lower_threshold = lower - (y_mass_shift + lower * 2e-5)
            upper_threshold = upper + (y_mass_shift + upper * 2e-5)
            for msms in reversed(list(tandem.get_higher_than(lower_threshold))):
                if msms.neutral_mass > upper_threshold:
                    break
                for part in parts:
                    y_query = part.mass + y_mass_shift + seq.mass
                    if seq.kind != 'b' and (fabs(ppm_error(msms.neutral_mass, y_query)) < 2e-5):
                        ext = seq.extend(part, False)
                        ext.matches.append(msms)
                        ext.kind = 'y'
                        next_sequences.append(ext)
                        matched_peaks.append(msms)
                    # Same arithmetic order as swapping the shift in place.
                    b_query = y_query + (-y_mass_shift + b_mass_shift)
                    if seq.kind != 'y' and (fabs(ppm_error(msms.neutral_mass, b_query)) < 2e-5):
                        ext = seq.extend(part, False)
                        # Record the matched peak as the y branch does; without
                        # it this extension gains no match here and can be
                        # dropped by the len(seq.matches) > 0 filter below.
                        ext.matches.append(msms)
                        ext.kind = 'b'
                        next_sequences.append(ext)
                        matched_peaks.append(msms)
            if len(matched_peaks) == 0:
                if seq.current_gaps + 1 <= max_running_gaps and seq.total_gaps + 1 <= max_total_gaps:
                    for part in parts:
                        next_sequences.append(seq.extend(part, True))
        logger.info("Round over, %d candidates", len(next_sequences))
        if len(next_sequences) == 0:
            return ResultsGroup([seq for kept_round in solutions for seq in kept_round], parts)
        previous_sequences = next_sequences
        # Kept lazy so the round is not loaded into memory.
        # NOTE(review): this relies on the queue being iterable more than once - confirm.
        solutions.append(seq for seq in next_sequences if len(seq.matches) > 0)
        next_sequences = SqliteDiskQueue()