Example #1
0
    def writeInParamsToFile(paramsDict, path):
        """
        Writes the input parameters to the file 'in_params.txt' inside the given
        directory, one parameter per line in the form '<name>: <value>'.

        :param paramsDict: Dictionary that maps parameter names to values. Names and
                           values are converted with `str` before being written, so
                           non-string values (numbers, booleans) are accepted.
        :param path: Path to the output directory. The file name is appended to this
                     string verbatim, i.e., the path is expected to end with a path
                     separator.
        :return: No return value.
        :raises GenonetsError: With error code `CANNOT_CREATE_DIRECTORY` if the
                               directory part of `path` cannot be created, or with
                               error code `CANNOT_WRITE_TO_FILE` if the file cannot
                               be opened or written.
        """
        fileName = path + "in_params.txt"
        directory = os.path.dirname(path)

        # An empty directory component means there is nothing to create;
        # `os.makedirs("")` would only fail.
        if directory and not os.path.exists(directory):
            # Create the directories
            try:
                os.makedirs(directory)
            except OSError:
                # The directory may have been created by someone else between the
                # existence check and `makedirs`; only report a failure if it is
                # still missing.
                if not os.path.isdir(directory):
                    print("Error: " +
                          ErrorCodes.getErrDescription(ErrorCodes.CANNOT_CREATE_DIRECTORY) +
                          ": Path - " + path)

                    raise GenonetsError(ErrorCodes.CANNOT_CREATE_DIRECTORY,
                                        "Path - " + path)

        # Write one line per parameter
        try:
            with open(fileName, "w") as outFile:
                for param, value in paramsDict.items():
                    outFile.write(str(param) + ": " + str(value) + "\n")
        except Exception:
            print("Error: " +
                  ErrorCodes.getErrDescription(ErrorCodes.CANNOT_WRITE_TO_FILE) +
                  ": Path - " + path)

            # Carry the path in the exception as well, consistent with the
            # directory creation failure above.
            raise GenonetsError(ErrorCodes.CANNOT_WRITE_TO_FILE,
                                "Path - " + path)
Example #2
0
    def analyze(self, genotype_sets=Gc.ALL, analyses=Gc.ALL, parallel=False):
        """
        Performs all analyses provided in the list of analysis types, on the given genotype sets.

        This method can only be used if `create` has already been called on the same `Genonets`
        object.

        :param genotype_sets: Name of a single genotype set, or list of names of the genotype
                              sets to be analyzed. If a value is not explicitly specified for
                              this parameter, all genotype sets returned by
                              `self.genotype_sets()` are analyzed.
        :param analyses: List of analysis type constants. These constants are defined in the class
                         `genonets.genonets_constants.AnalysisConstants`. If the value for this
                         parameter is not explicitly set, all available analyses are performed.
        :param parallel: Flag to indicate whether or not parallel processing should
                         be used. Overlap analysis is always performed serially, after
                         the parallel pass has finished.
        :return: No return value.
        :raises GenonetsError: With error code `NOT_ENOUGH_REPS_OLAP` if overlap analysis is
                               requested (explicitly, or implicitly via `Gc.ALL`) and fewer
                               than two genotype sets are selected.
        """

        if self.VERBOSE:
            sys.stdout.write("\nPerforming analyses:")

        # If all genotype_sets should be considered,
        if genotype_sets == Gc.ALL:
            # Get a list of all genotype_sets
            genotype_sets = self.genotype_sets()

        # If a single string is received, convert it into an iterable.
        # `isinstance` also covers subclasses of `str`, which would otherwise
        # be iterated character by character.
        if isinstance(genotype_sets, str):
            genotype_sets = [genotype_sets]

        # If overlap is one of the requested analyses, there need to be
        # at least two genotype_sets in the dataset
        if analyses == Gc.ALL or Ac.OVERLAP in analyses:
            if len(genotype_sets) < 2:
                print("Error: " +
                      ErrorCodes.getErrDescription(ErrorCodes.NOT_ENOUGH_REPS_OLAP) +
                      ": Tau=" + str(self.cmdArgs.tau))

                raise GenonetsError(
                    ErrorCodes.NOT_ENOUGH_REPS_OLAP,
                    "Tau=" + str(self.cmdArgs.tau))

        # If multiprocessing should be used,
        if parallel:
            # Perform all analyses in parallel; overlap will be ignored.
            self._analyze_networks_parallel(genotype_sets, analyses)

            if analyses == Gc.ALL or Ac.OVERLAP in analyses:
                # Reset analysis handler to make sure it references
                # the updated dicts
                del self.analyzer
                self.analyzer = AnalysisHandler(self)

                # Use serial processing to perform overlap analysis
                self._analyze_networks(genotype_sets, [Ac.OVERLAP])
        else:
            # Perform all analyses using serial processing
            self._analyze_networks(genotype_sets, analyses)
Example #3
0
    def __init__(self, arguments):
        """
        Copies the parsed command line arguments into attributes of this object,
        prints them, and writes them to the output directory.

        :param arguments: Object that exposes the parsed arguments as the attributes
                          `alphabetType`, `use_reverse_complements`, `includeIndels`,
                          `inFilePath`, `tau`, `outPath`, `num_procs` and `verbose`.
        :raises GenonetsError: With error code `RC_ALPHABET_MISMATCH` if reverse
                               complements are requested for an alphabet type other
                               than "DNA".
        """
        # Alphabet type, e.g., RNA, DNA, or Protein
        self.moleculeType = arguments.alphabetType

        # Normalize the 'use reverse complements' option to a strict boolean
        self.use_reverse_complements = bool(arguments.use_reverse_complements)

        # The reverse complements option is only accepted together with
        # the DNA alphabet
        if self.use_reverse_complements and self.moleculeType != "DNA":
            print("Error: " +
                  ErrorCodes.getErrDescription(ErrorCodes.RC_ALPHABET_MISMATCH))

            raise GenonetsError(ErrorCodes.RC_ALPHABET_MISMATCH)

        # Shift mutations are considered only if the option was given as
        # the text "true", in any letter case
        self.useIndels = arguments.includeIndels.lower() == "true"

        # Location of the input file
        self.inFilePath = arguments.inFilePath

        # Minimum fitness value to be used
        self.tau = arguments.tau

        # Output folder; the file writing routines need the path to
        # end with "/"
        self.outPath = arguments.outPath

        if not self.outPath.endswith("/"):
            self.outPath += "/"

        # Upper limit on the number of parallel processes
        self.num_procs = arguments.num_procs

        # Normalize the verbose option to a strict boolean
        self.verbose = bool(arguments.verbose)

        # Collect all parameter values, as text, for reporting
        params = {
            "alphabetType": self.moleculeType,
            "includeIndels": str(self.useIndels),
            "inFilePath": self.inFilePath,
            "tau": str(self.tau),
            "outPath": self.outPath,
            "useReverseComplements": str(self.use_reverse_complements),
            "num_procs": str(self.num_procs),
            "verbose": str(self.verbose)
        }

        # Echo the parameter values to the console
        self.printInParams(params)

        # Persist the parameter values in the output folder
        Writer.writeInParamsToFile(params, self.outPath)
Example #4
0
    def dict_reader_for_file(file_name):
        """
        Opens a tab-delimited text file and wraps it in a `csv.DictReader`.

        :param file_name: Path to the file to be read.
        :return: Tuple `(reader, data_file)`, where `reader` is the `csv.DictReader`
                 and `data_file` is the underlying open file handle. The handle is
                 not closed here; closing it is left to the caller.
        :raises GenonetsError: With error code `UNKNOWN_PARSING_ERROR` if the file
                               cannot be opened.
        """
        # Open file. The legacy 'U' mode flag is deliberately not used: it was
        # removed in Python 3.11, where it makes `open` fail for every file.
        # On Python 3, plain text mode already uses universal newlines.
        try:
            data_file = open(file_name, 'r')
        except Exception:
            print("Error: " +
                  ErrorCodes.getErrDescription(ErrorCodes.UNKNOWN_PARSING_ERROR))

            raise GenonetsError(ErrorCodes.UNKNOWN_PARSING_ERROR)

        # Read the file into a dictionary
        reader = csv.DictReader(data_file, delimiter="\t")

        return reader, data_file
Example #5
0
    def verify_genotype(genotype, genotype_length, alphabet_type, line_number):
        """
        Checks a single genotype for the expected length and for letters that
        are not part of the alphabet.

        :param genotype: The genotype sequence to be checked.
        :param genotype_length: Length that the genotype is required to have.
        :param alphabet_type: Alphabet type passed on to
                              `SupportedAlphabet.getAlphabet`.
        :param line_number: Line number, as text, used in the error messages.
        :return: No return value.
        :raises GenonetsError: With error code `INCONSISTENT_SEQ_LEN` if the length
                               does not match, or with error code
                               `ALPHABET_TYPE_MISMATCH` if a letter is not in the
                               alphabet.
        """
        def reject(error_code):
            # Report the problem on the console, then abort with the same
            # error code and the line number as detail
            print("Error: " +
                  ErrorCodes.getErrDescription(error_code) +
                  ": Line No. " + line_number)

            raise GenonetsError(error_code, "Line No. " + line_number)

        # The length is checked first; the alphabet is only looked up
        # for genotypes of the right length
        if len(genotype) != genotype_length:
            reject(ErrorCodes.INCONSISTENT_SEQ_LEN)

        # Alphabet that corresponds to the requested type
        alphabet = SupportedAlphabet.getAlphabet(alphabet_type)

        # Stop at the first letter that does not belong to the alphabet
        for letter in genotype:
            if letter not in alphabet:
                reject(ErrorCodes.ALPHABET_TYPE_MISMATCH)
Example #6
0
    def build_data_dicts(in_file_path, tau, alphabet_type):
        """
        Parses the input file and builds the data structures that describe the
        genotype sets found in it.

        Only rows with a score greater than or equal to `tau` are used. The delta
        value of a genotype set is read from the first such row of that set. The
        length of the first used genotype is taken as the genotype length for the
        entire dataset, and every used genotype is verified against it.

        :param in_file_path: Path to the tab-delimited input file.
        :param tau: Lower bound on the score; rows with a lower score are skipped.
        :param alphabet_type: Alphabet type passed on to `InReader.verify_genotype`.
        :return: Tuple `(data_dict, delta_dict, genotype_to_set_dict, genotype_length,
                 genotype_sets)`, where
                 `data_dict` maps genotype set name to a dict of genotype -> score,
                 `delta_dict` maps genotype set name to its delta value,
                 `genotype_to_set_dict` is the result of
                 `InReader.build_genotype_to_set_dict` (reverse dictionary, genotype
                 to genotype sets),
                 `genotype_length` is the common genotype length, and
                 `genotype_sets` lists the genotype set names in the order in which
                 they were first read from the file.
        :raises GenonetsError: With one of the error codes `INCONSISTENT_HEADER`,
                               `MISSING_VALUE`, `BAD_SCORE_FORMAT`, `BAD_DELTA_FORMAT`
                               or `NO_USABLE_SCORES`; errors raised by
                               `InReader.dict_reader_for_file` and
                               `InReader.verify_genotype` are passed through.
        """
        # Data structures to be returned
        data_dict = {}
        delta_dict = {}
        genotypes = []      # List of unique genotypes across all genotype sets
        genotype_sets = []  # List of genotype sets in the order in which they are read from file

        # Companion set for `genotypes`, so that the uniqueness check does not
        # have to scan the whole list for every row
        seen_genotypes = set()

        # Genotype length to be determined
        genotype_length = 0

        def report_and_raise(error_code, detail=None):
            # Print the error description (plus detail, if any) and raise the
            # corresponding GenonetsError
            description = ErrorCodes.getErrDescription(error_code)

            if detail is None:
                print("Error: " + description)
                raise GenonetsError(error_code)

            print("Error: " + description + ": " + detail)
            raise GenonetsError(error_code, detail)

        # Get handle to the input file and a DictReader for the file
        reader, in_file = InReader.dict_reader_for_file(in_file_path)

        # The file is closed in exactly one place, no matter how parsing ends
        try:
            # Check if all the required column headers are available in the file
            if not InReader.req_hdrs_are_present(reader.fieldnames):
                report_and_raise(ErrorCodes.INCONSISTENT_HEADER)

            # For each data row in the file,
            for row in reader:
                line_number = str(reader.line_num)
                line_info = "Line No. " + line_number

                # Check for missing values in this row
                if any(value in (None, "") for value in row.values()):
                    report_and_raise(ErrorCodes.MISSING_VALUE, line_info)

                # Get the fitness score
                try:
                    score = float(row["Score"])
                except (KeyError, TypeError, ValueError):
                    report_and_raise(ErrorCodes.BAD_SCORE_FORMAT, line_info)

                # Skip rows whose score is not greater than or equal to the
                # given threshold. Written as a negated '>=' so that a NaN
                # score is skipped as well.
                if not (score >= tau):
                    continue

                genotype_set = row["Genotypeset"]

                # If the current genotype set has not already been added,
                if genotype_set not in data_dict:
                    # Initialize dict for the genotype set
                    data_dict[genotype_set] = {}

                    # Add the name of this genotype set to the ordered list of
                    # genotype set names
                    genotype_sets.append(genotype_set)

                    # Get the delta value
                    try:
                        delta = float(row["Delta"])
                    except (KeyError, TypeError, ValueError):
                        report_and_raise(ErrorCodes.BAD_DELTA_FORMAT, line_info)

                    delta_dict[genotype_set] = delta

                # Get the genotype sequence
                genotype = row["Genotype"]

                # If genotype length has not been initialized yet, i.e., this is
                # the first used row,
                if genotype_length == 0:
                    # Set length of the current genotype as the genotype length
                    # for the entire dataset
                    genotype_length = len(genotype)

                InReader.verify_genotype(genotype, genotype_length, alphabet_type,
                                         line_number)

                # Add genotype as key and score as value to the current
                # genotype set
                data_dict[genotype_set][genotype] = score

                # If the genotype has not already been read for any other
                # genotype set,
                if genotype not in seen_genotypes:
                    # Add it to the list of unique sequences found in the
                    # input file
                    seen_genotypes.add(genotype)
                    genotypes.append(genotype)
        finally:
            in_file.close()

        # If no genotypes were found with score >= tau,
        if not data_dict:
            report_and_raise(ErrorCodes.NO_USABLE_SCORES, "Tau=" + str(tau))

        # Dictionary: Key=Sequence, Value=[Genotype sets]. Reverse dictionary
        # that is used in functions like evolvability.
        genotype_to_set_dict = InReader.build_genotype_to_set_dict(genotypes, data_dict)

        return data_dict, delta_dict, genotype_to_set_dict, genotype_length, genotype_sets