Exemple #1
0
def read_smiles(file_names):
    """
    Read SMILES strings from one or more text files and record the list each one belongs to.

    Every non-blank line is stored exactly as read (the trailing newline is kept, callers
    strip it themselves). A blank line inside a file, or the end of a file, closes the
    current list. Lists are numbered consecutively from 0.

    Runs of blank lines, trailing blank lines and empty files do NOT consume a list index.
    Callers use the number of distinct indices as the size of per-list arrays and then index
    those arrays by membership, so a gap in the numbering would lead to an IndexError.

    :param file_names: iterable of paths to SMILES text files
    :return: a tuple of (smiles strings, list_memberships) where list_memberships[i] is the
             index of the list that smiles strings[i] was read from
    """

    list_membership = []
    smiles_set = []
    list_counter = 0
    # True once the list currently being read holds at least one SMILES string
    current_list_has_members = False

    for file_name in file_names:
        with open(file_name, 'r') as smiles_file:
            for line in smiles_file:
                if line.strip():
                    smiles_set.append(line)
                    list_membership.append(list_counter)
                    current_list_has_members = True
                elif current_list_has_members:
                    # A blank line only closes a list that actually has members
                    list_counter += 1
                    current_list_has_members = False

        # A file boundary always separates lists, but an empty list is never counted
        if current_list_has_members:
            list_counter += 1
            current_list_has_members = False

    # list_counter now equals the number of non-empty lists that were read
    verbose.info("data_input", list_counter, len(smiles_set))

    return (smiles_set, list_membership)
    def _find_representative_structures(self, rep_paths):
        """
        Build path structures for the representative paths and keep the frequent ones.

        A PathStructure is created for every occurrence of every representative path in every
        molecule and handed to its own check_if_duplicate together with self.path_structures,
        self.multiple_dict and self.nx_structures (presumably so that isomorphic structures are
        only stored once and the frequency count stays accurate - confirm in PathStructure).

        A structure is kept when the fraction of molecules it maps to reaches self.threshold and
        self.check_list_frequency accepts those molecules.

        :param rep_paths: a list of the paths which are representative
        :return: an OrderedDict {path structure: relative frequency}, highest frequency first;
                 when self.not_random is set, ties are ordered by the structure ID as well
        """

        def by_frequency(item):
            return item[1]

        def by_frequency_then_id(item):
            return (item[1], item[0].ID)

        verbose.info("rep_struct_start")

        for path in rep_paths:
            for molecule in self.molecules:
                if path not in molecule.paths:
                    continue
                # Each entry is the group of vertices of one occurrence of the path
                for vertices in molecule.paths[path]:
                    candidate = ps.PathStructure(path, vertices, molecule)
                    candidate.check_if_duplicate(molecule,
                                                 candidate.struct_parent_map,
                                                 self.path_structures,
                                                 self.multiple_dict,
                                                 self.nx_structures)

        # Relative frequency of each stored structure, kept only if it is frequent enough
        rep_structures = {}
        for structure in self.path_structures:
            supporting_molecules = self.path_structures[structure]
            frequency = len(supporting_molecules) / float(len(self.molecules))
            if frequency >= self.threshold and self.check_list_frequency(
                    supporting_molecules.keys()):
                rep_structures[structure] = frequency

        # Highest frequency first; the ID tie-break is only used when not_random is set
        sort_key = by_frequency_then_id if self.not_random else by_frequency
        ranked = OrderedDict(
            sorted(rep_structures.items(), key=sort_key, reverse=True))

        verbose.info("rep_struct_finish", ranked)
        return ranked
def assess_representativeness(candidate, cs):
    """
    Score a candidate molecule against the molecule set held by a CS algorithm object.

    The candidate is temporarily appended to cs.molecules, the paths and representative
    structures are recomputed with the candidate included, and the resulting structures are
    passed to score_structures.

    The temporary changes to cs (the appended candidate and the generated structures) are
    undone in a finally block, so cs is restored even when path finding or scoring raises.

    :param candidate: molecule object to evaluate
    :param cs: algorithm object providing molecules, find_graphs_paths,
               find_all_representative_structures, clear_all_structures and name
    :return: the value returned by score_structures for the representative structures
    """

    cs.molecules.append(candidate)
    try:
        verbose.info("start_paths")
        paths = cs.find_graphs_paths()
        verbose.info("finish_paths")

        rep_structs = cs.find_all_representative_structures(paths,
                                                            name=cs.name)
        # The return value is computed before the finally block runs
        return score_structures(rep_structs)
    finally:
        # Same clean-up order as on the success path: clear structures, then drop the candidate
        cs.clear_all_structures()
        cs.molecules.pop()
    def find_graphs_paths(self):
        """
        Collect the paths of every molecule held by this object.

        One shared container is passed to find_all_paths of each molecule in self.molecules,
        so all molecules contribute to the same result.

        :return: the shared path container; it starts out as an empty list (other methods read
                 it as paths[length], so it is presumably indexed by path length - confirm in
                 Molecule.find_all_paths)
        """
        collected_paths = []

        for current_molecule in self.molecules:
            current_molecule.find_all_paths(collected_paths)

        verbose.info("show_path_info", collected_paths)
        return collected_paths
    def find_characteristic_substructure(self, paths):
        """
        Find the characteristic substructure (CS) for the molecules held by this object.

        Starting at the length given by get_starting_length and working down to
        self.length_end, the representative paths and then the representative structures of
        each length are computed, and every structure is offered to the CS in order of
        frequency.

        Until the CS has been started the length decreases by 1 per round; afterwards it
        decreases by self.step.

        :param paths: the path container produced by find_graphs_paths
        :return: self.characteristic_substructure, or None when paths is empty
        """
        # Structures from an earlier run are always discarded, even if there is nothing to do
        self.clear_all_structures()
        if not paths:
            return None

        length = self.length_start = self.get_starting_length(paths)
        verbose.info("starting_length", length)

        while length >= self.length_end:
            verbose.info("current_length", length, self.name)

            frequent_paths = self._find_representative_paths(paths, length)
            ranked_structures = self._find_representative_structures(
                frequent_paths)

            # Structures arrive sorted by frequency, most frequent first
            for structure in ranked_structures:
                self._add_structure_to_characteristic(structure)

            self.clear_old_structures()

            # The larger step is only used once the characteristic substructure has been started
            length -= self.step if self.cs_begun else 1

        return self.characteristic_substructure
    def _add_structure_to_characteristic(self, structure):
        """
        Try to add one representative path structure to the characteristic substructure (CS).

        Three cases are handled:
        - the CS has not been started: the structure becomes its base via add_CS_base
        - can_add_multiple_times accepts the structure: it is merged at every location returned
          by find_multiple_addable_locations
        - otherwise: addable locations are gathered from every molecule containing the
          structure and it is merged at the first location returned by
          get_k_most_frequent_locations for k = 1

        Nothing is changed when no location is found.

        :param structure: the path structure to add
        :return: None
        """
        if not self.cs_begun:
            self.add_CS_base(structure)
            verbose.info("add_to_CS", structure.label)
            return

        if self.can_add_multiple_times(structure):
            locations = self.find_multiple_addable_locations(structure)
            if not locations:
                return
            verbose.info("add_to_CS", structure.label, len(locations))
            combined_map = {}
            for location in locations:
                combined_map.update(self.merge_to_CS(location, structure))
            self._add_cs_locations(structure, combined_map)
            return

        # Single addition: collect candidate locations from every molecule with this structure
        candidates = []
        for molecule in self.path_structures[structure]:
            mapping = self.path_structures[structure][molecule]
            self.find_addable_location(structure, molecule, candidates,
                                       mapping)
        if not candidates:
            return

        frequency = self.get_location_frequency(candidates)
        verbose.info("add_to_CS", structure.label)
        best_location = self.get_k_most_frequent_locations(
            frequency[0], frequency[1], 1)[0]
        merged_map = self.merge_to_CS(best_location, structure)

        self._add_cs_locations(structure, merged_map)
    def _find_representative_paths(self, paths, length):
        """
        Select the paths of one length that occur in enough molecules.

        A path is representative when the fraction of molecules containing it reaches
        self.threshold. When self.using_lists is set, the fraction of input lists that have at
        least one molecule containing the path must also reach self.list_threshold.

        :param paths: path container indexed by length; paths[length] is iterated
        :param length: an integer which sets the length of the paths that should be considered
        :return: a list of the paths which are representative
        """
        verbose.info("rep_path_start")
        track_lists = self.using_lists
        representative_paths = []

        for candidate in paths[length]:
            occurrences = 0
            # One flag per input list, only needed when list membership is taken into account
            lists_hit = [False] * self.list_number if track_lists else None

            for molecule in self.molecules:
                if candidate not in molecule.paths:
                    continue
                occurrences += 1
                if lists_hit is not None:
                    lists_hit[molecule.list_member] = True

            frequent_enough = \
                float(occurrences) / len(self.molecules) >= self.threshold
            if frequent_enough and (
                    lists_hit is None or
                    float(lists_hit.count(True)) / self.list_number >=
                    self.list_threshold):
                representative_paths.append(candidate)

        verbose.info("rep_path_finish", len(representative_paths))
        return representative_paths
    def __init__(self, **kwargs):
        """
        Set the default algorithm parameters, apply overrides and create empty working state.

        A keyword argument is applied only when an attribute of that name already exists at
        the time of the check and the given value is truthy; every other keyword argument is
        ignored, so None, 0, False and empty values never replace a default.

        :param kwargs: optional overrides for the parameters defined at the top of this method
        """
        # Path lengths: the start is determined later, the search stops at length_end
        self.length_start = None
        self.length_end = 5
        self.step = 4
        # Thresholds
        self.threshold = 0.8
        self.isomorphism_factor = 0.8
        self.list_threshold = 0.8
        # Input list handling
        self.list_number = -1
        self.using_lists = False
        # Remaining options
        self.not_random = False
        self.name = "CS"

        # Only the attributes known at this point can be overridden, and only by truthy values
        for option, given in kwargs.iteritems():
            if hasattr(self, option) and given:
                setattr(self, option, given)
                verbose.info("show_setting", option, given)

        # Characteristic substructure built by combining representative path structures
        self.characteristic_substructure = m.Molecule(
            'Characteristic Substructure')
        # Whether the characteristic substructure contains a path structure yet
        self.cs_begun = False
        # Structures which have been added to the characteristic substructure
        self.cs_structures = []
        # Locations of the molecules which map to the characteristic substructure
        # {molecule: {molecule vertex: cs vertex}}
        self.cs_locations = {}

        # All the given molecules
        self.molecules = []
        # Path structures with the isomorphic molecules and the structure-to-molecule vertex mapping
        # {path structure: {molecule: {structure vertex: molecule vertex}}}
        self.path_structures = {}
        # Structures that appear multiple times in molecules with the molecule vertices that map them
        # {single structure: {molecule: {structure vertex: [molecule vertices]}}}
        self.multiple_dict = {}

        # Passed to PathStructure.check_if_duplicate together with the two dictionaries above
        self.nx_structures = {}
    def find_all_representative_structures(self, paths, name="CS"):
        """
        Collect the representative structures of every path length, ordered by frequency.

        Every length from the starting length down to self.length_end is examined; unlike the
        search for the characteristic substructure the length always decreases by 1, so no
        length is skipped.

        :param paths: the path container produced by find_graphs_paths
        :param name: label passed to the progress output
        :return: an OrderedDict {structure: relative frequency} sorted from the highest to the
                 lowest frequency, or None when paths is empty
        """
        # Structures from an earlier run are always discarded, even if there is nothing to do
        self.clear_all_structures()
        if not paths:
            return None

        collected = {}
        length = self.get_starting_length(paths)
        verbose.info("starting_struct_length", length)

        while length >= self.length_end:
            verbose.info("current_length", length, name)
            frequent_paths = self._find_representative_paths(paths, length)
            ranked = self._find_representative_structures(frequent_paths)
            if ranked:
                collected.update(ranked)

            self.clear_old_structures()
            # To get the structures of all lengths the step does not alter
            length -= 1

        by_frequency = sorted(collected.items(),
                              key=lambda entry: entry[1],
                              reverse=True)
        return OrderedDict(by_frequency)
def main():
    """
    Creates the algorithm object with the correct parameters and calls methods based on the command line arguments.

    Steps, in order:
    - parse the command line arguments and switch on verbose output if requested
    - extend the module level output_directory with the given output name, or with the base
      name (without extension) of the first SMILES file
    - when the "rm" sub-command was chosen, delegate to
      rep_mol.find_most_representative_molecule and stop
    - otherwise read the SMILES files, add the molecules and find their paths
    - optionally print and write the representative structures
    - find, print and write the characteristic substructure, optionally drawing it and
      running the fragmentation (CFM) annotation and plot

    The file name for the SMILES file is mandatory, the flags and threshold specification are optional
    :return: None
    """

    # The module level output directory is extended in place below
    global output_directory
    #Argument Parsing
    args = parse_arguments()

    #Configuring options
    if args.verbose:
        verbose.initialise()

    # Results go into a sub-directory named after --output_name or after the first input file
    if args.output_name:
        output_directory += "/" + args.output_name
    else:
        file_path, extension = os.path.splitext(args.smiles_files[0])
        output_directory += "/" + os.path.basename(file_path)

    # The "rm" sub-command is handled entirely by the representative molecule module
    if args.which == "rm":
        rep_mol.find_most_representative_molecule(args, output_directory)
        return

    # All parsed arguments are offered as settings; the constructor picks the ones it knows
    cs = CSAlgorithm(**vars(args))

    smiles_set, list_memberships = io.read_smiles(args.smiles_files)
    # Number of distinct input lists; list handling is only enabled for more than one list
    cs.list_number = len(set(list_memberships))

    if cs.list_number > 1:
        cs.using_lists = True

    verbose.info("total_time_start")
    cs.add_molecules(smiles_set, list_memberships)

    verbose.info("start_paths")
    paths = cs.find_graphs_paths()
    verbose.info("finish_paths")

    # Optional report of every representative structure with its frequency
    if args.representative_struct:
        all_structures = cs.find_all_representative_structures(paths)
        print 'Representative Structures:'
        if all_structures:
            io.check_dir(output_directory)
            # One "<smiles> <frequency>" line per structure
            structure_frequency_list = [
                molecule.to_smiles() + " " + str(all_structures[molecule])
                for molecule in all_structures
            ]
            for structure in structure_frequency_list:
                print structure
            io.write_representative_structure(
                structure_frequency_list,
                output_directory + '/representative_structures.txt')
        else:
            print "None found."

    c_structure = cs.find_characteristic_substructure(paths)

    #Display the structures in the command line
    print 'Characteristic Substructure:'
    if c_structure:
        smiles = c_structure.to_smiles()
        print smiles

        io.check_dir(output_directory)
        io.write_to_file(smiles, output_directory + '/cs.txt')

        # The drawing module is only imported when an image was requested
        if args.image_of_CS:
            import src.drawing.draw_molecule as draw
            draw.draw_smiles(smiles, output_directory + "/cs.png")

        # The fragmentation modules are only imported when a fragmentation pattern was given
        if args.fragmentation_pattern:
            import src.fragmentation.fragmentation_matcher as frag
            import src.drawing.plot_spectra as plot
            result = frag.call_cfm_peak_annotation(smiles,
                                                   args.fragmentation_pattern,
                                                   "CS")
            io.write_to_file(result, output_directory + '/cfm_output.txt')
            print result
            # The annotation is parsed from the file that was just written
            cfm_dict = cfm_parse.parse_cfm(
                io.read_file(output_directory + '/cfm_output.txt'))
            spec_dict = io.read_spectra(args.fragmentation_pattern)

            # A falsy minimum score (including 0) falls back to the default of plot_cfm_data
            if args.cfm_min_score:
                plot.plot_cfm_data(cfm_dict, spec_dict, output_directory,
                                   args.cfm_min_score)
            else:
                plot.plot_cfm_data(cfm_dict, spec_dict, output_directory)
    else:
        print "None found."
    verbose.info("total_time_finish")
def find_most_representative_molecule(args, output_dir):

    if len(args.smiles_files) % 2 != 0:
        print "Number of input smiles files NEED to be an even number"
        print "Given:", args.smiles_files
        print "Number:", len(args.smiles_files)
        return

    file_path, extension = os.path.splitext(args.molecule_check_list)
    output_directory = os.path.dirname(output_dir) + "/" + os.path.basename(
        file_path)

    verbose.info("total_time_start")
    set_length = len(args.smiles_files) / 2

    cs_A = setup_CS("CS A", args.smiles_files[:set_length], args)
    cs_B = setup_CS("CS B", args.smiles_files[set_length:], args)
    candidate_smiles = io.read_smiles([args.molecule_check_list])[0]

    best_score = -1
    best_candidate = None
    best_smiles = None
    best_score_counter = 0

    for smiles in candidate_smiles:
        smiles = smiles.rstrip()
        candidate = Parser().parse_smiles(smiles)
        candidate.set_list_membership(0)
        score_A = assess_representativeness(candidate, cs_A)
        score_B = assess_representativeness(candidate, cs_B)

        score = calculate_score(score_A, score_B)
        print smiles, score
        if not best_candidate or score > best_score:
            best_candidate = candidate
            best_score = score
            best_smiles = smiles

    if best_candidate:
        smiles_A = get_CS(cs_A, best_candidate)
        smiles_B = get_CS(cs_B, best_candidate)
        results = "Best candidate: " + best_smiles
        results += "\nScore: " + str(best_score)
        results += "\n" + cs_A.name + ":" + smiles_A
        results += "\n" + cs_B.name + ":" + smiles_B

        print "\n" + results
        io.check_dir(output_directory)
        io.write_to_file(results,
                         output_directory + '/representative_molecule.txt')

        if args.images_of_molecules:
            import src.drawing.draw_molecule as draw
            draw.draw_smiles(best_smiles,
                             output_directory + "/representative_mol.png")
            draw.draw_smiles(smiles_A, output_directory + "/cs_A.png")
            draw.draw_smiles(smiles_B, output_directory + "/cs_B.png")
    else:
        print "No candidate found."

    verbose.info("total_time_finish")