def writeInParamsToFile(paramsDict, path):
    """
    Write the input parameters to the file `in_params.txt` under `path`.

    Each parameter is written on its own line as `name: value`.

    :param paramsDict: Dictionary mapping parameter names to their values.
                       Both keys and values are concatenated as strings.
    :param path: Output location; the file name is appended directly to it,
                 so a directory path is expected to end with a separator.
                 Missing directories are created.
    :return: No return value.
    :raises GenonetsError: With `CANNOT_CREATE_DIRECTORY` if the directories
                           cannot be created, or `CANNOT_WRITE_TO_FILE` if
                           writing the file fails.
    """
    fileName = path + "in_params.txt"
    directory = os.path.dirname(path)

    # An empty directory component means the current working directory, which
    # needs no creation (and `os.makedirs("")` would fail).
    if directory:
        try:
            # `exist_ok` avoids the race between checking for the directory
            # and creating it.
            os.makedirs(directory, exist_ok=True)
        except OSError as err:
            print("Error: " +
                  ErrorCodes.getErrDescription(ErrorCodes.CANNOT_CREATE_DIRECTORY) +
                  ": Path - " + path)

            raise GenonetsError(ErrorCodes.CANNOT_CREATE_DIRECTORY,
                                "Path - " + path) from err

    try:
        with open(fileName, "w") as outFile:
            # One line per input parameter
            for param, value in paramsDict.items():
                outFile.write(param + ": " + value + "\n")
    except Exception as err:
        print("Error: " +
              ErrorCodes.getErrDescription(ErrorCodes.CANNOT_WRITE_TO_FILE) +
              ": Path - " + path)

        raise GenonetsError(ErrorCodes.CANNOT_WRITE_TO_FILE) from err
def analyze(self, genotype_sets=Gc.ALL, analyses=Gc.ALL, parallel=False):
    """
    Performs all analyses provided in the list of analysis types, on the
    given genotype sets.

    This method can only be used if `create` has already been called on the
    same `Genonets` object.

    :param genotype_sets: List of names of the genotype sets that should be
                          analysed; a single name may be given as a string.
                          If a value is not explicitly specified for this
                          parameter, all genotype sets available in the
                          parsed data are analysed.
    :param analyses: List of analysis type constants. These constants are
                     defined in the class
                     `genonets.genonets_constants.AnalysisConstants`. If the
                     value for this parameter is not explicitly set, all
                     available analyses are performed.
    :param parallel: Flag to indicate whether or not parallel processing
                     should be used.
    :return: No return value.
    :raises GenonetsError: With `NOT_ENOUGH_REPS_OLAP` if overlap analysis
                           is requested with fewer than two genotype sets.
    """
    if self.VERBOSE:
        sys.stdout.write("\nPerforming analyses:")

    # If all genotype sets should be considered, get the full list
    if genotype_sets == Gc.ALL:
        genotype_sets = self.genotype_sets()

    # A single name is wrapped in a list; `isinstance` also covers `str`
    # subclasses, which would otherwise be iterated character by character.
    if isinstance(genotype_sets, str):
        genotype_sets = [genotype_sets]

    overlap_requested = analyses == Gc.ALL or Ac.OVERLAP in analyses

    # Overlap needs at least two genotype sets in the dataset
    if overlap_requested and len(genotype_sets) < 2:
        print("Error: " +
              ErrorCodes.getErrDescription(ErrorCodes.NOT_ENOUGH_REPS_OLAP) +
              ": Tau=" + str(self.cmdArgs.tau))

        raise GenonetsError(
            ErrorCodes.NOT_ENOUGH_REPS_OLAP,
            "Tau=" + str(self.cmdArgs.tau))

    if not parallel:
        # Perform all analyses using serial processing
        self._analyze_networks(genotype_sets, analyses)
        return

    # Perform all analyses in parallel; overlap will be ignored.
    self._analyze_networks_parallel(genotype_sets, analyses)

    if overlap_requested:
        # Reset analysis handler to make sure it references the updated
        # dicts
        del self.analyzer
        self.analyzer = AnalysisHandler(self)

        # Use serial processing to perform overlap analysis
        self._analyze_networks(genotype_sets, [Ac.OVERLAP])
def __init__(self, arguments):
    """
    Copy the parsed command line arguments onto this object, then print
    them and write them to the output folder.

    :param arguments: Object exposing the attributes `alphabetType`,
                      `use_reverse_complements`, `includeIndels`,
                      `inFilePath`, `tau`, `outPath`, `num_procs` and
                      `verbose`.
    :raises GenonetsError: With `RC_ALPHABET_MISMATCH` if reverse
                           complements are requested for an alphabet type
                           other than "DNA".
    """
    # Molecule type: RNA, DNA, Protein, etc.
    self.moleculeType = arguments.alphabetType

    # 'Use reverse complements' flag, normalised to a bool
    self.use_reverse_complements = bool(arguments.use_reverse_complements)

    # Reverse complements only make sense for the DNA alphabet
    if self.use_reverse_complements and self.moleculeType != "DNA":
        print("Error: " +
              ErrorCodes.getErrDescription(ErrorCodes.RC_ALPHABET_MISMATCH))

        raise GenonetsError(ErrorCodes.RC_ALPHABET_MISMATCH)

    # Shift mutations are considered only when the argument reads "true"
    # (case-insensitive)
    self.useIndels = arguments.includeIndels.lower() == "true"

    # Path to the input file
    self.inFilePath = arguments.inFilePath

    # Lower bound on fitness values to be used
    self.tau = arguments.tau

    # Path to the output folder; the file writing routines need it to end
    # with "/"
    self.outPath = arguments.outPath
    if not self.outPath.endswith("/"):
        self.outPath = self.outPath + "/"

    # Maximum number of parallel processes to be used
    self.num_procs = arguments.num_procs

    # Verbose flag, normalised to a bool
    self.verbose = bool(arguments.verbose)

    # Parameter names mapped to their string representations
    parsed_params = {
        "alphabetType": self.moleculeType,
        "includeIndels": str(self.useIndels),
        "inFilePath": self.inFilePath,
        "tau": str(self.tau),
        "outPath": self.outPath,
        "useReverseComplements": str(self.use_reverse_complements),
        "num_procs": str(self.num_procs),
        "verbose": str(self.verbose),
    }

    # Print the parsed parameter values
    self.printInParams(parsed_params)

    # Write input parameters to file
    Writer.writeInParamsToFile(parsed_params, self.outPath)
def dict_reader_for_file(file_name):
    """
    Open a tab-delimited file and wrap it in a `csv.DictReader`.

    The caller is responsible for closing the returned file handle.

    :param file_name: Path to the tab-delimited input file.
    :return: Tuple `(reader, data_file)`: the `csv.DictReader` and the open
             file object it reads from.
    :raises GenonetsError: With `UNKNOWN_PARSING_ERROR` if the file cannot
                           be opened.
    """
    try:
        # `newline=""` lets the csv module handle line endings itself. The
        # former 'rU' mode was removed in Python 3.11, where it makes
        # `open` raise `ValueError`.
        data_file = open(file_name, "r", newline="")
    except Exception as err:
        print("Error: " +
              ErrorCodes.getErrDescription(ErrorCodes.UNKNOWN_PARSING_ERROR))

        raise GenonetsError(ErrorCodes.UNKNOWN_PARSING_ERROR) from err

    # Rows are exposed as dictionaries keyed by the header line
    reader = csv.DictReader(data_file, delimiter="\t")

    return reader, data_file
def verify_genotype(genotype, genotype_length, alphabet_type, line_number):
    """
    Check that a genotype has the expected length and uses only letters
    from the alphabet of the given type.

    :param genotype: The genotype sequence to verify.
    :param genotype_length: Length every genotype is required to have.
    :param alphabet_type: Alphabet type passed to
                          `SupportedAlphabet.getAlphabet`.
    :param line_number: Line number of the genotype in the input file, as a
                        string; used in the error messages only.
    :return: No return value.
    :raises GenonetsError: With `INCONSISTENT_SEQ_LEN` if the length differs
                           from `genotype_length`, or with
                           `ALPHABET_TYPE_MISMATCH` if a letter is not part
                           of the alphabet.
    """
    # Length check comes first
    if len(genotype) != genotype_length:
        description = ErrorCodes.getErrDescription(
            ErrorCodes.INCONSISTENT_SEQ_LEN)
        location = "Line No. " + line_number
        print("Error: " + description + ": " + location)

        raise GenonetsError(ErrorCodes.INCONSISTENT_SEQ_LEN, location)

    # Alphabet corresponding to the type received as argument
    alphabet = SupportedAlphabet.getAlphabet(alphabet_type)

    # Nothing more to do if every letter belongs to the alphabet
    if all(symbol in alphabet for symbol in genotype):
        return

    description = ErrorCodes.getErrDescription(
        ErrorCodes.ALPHABET_TYPE_MISMATCH)
    location = "Line No. " + line_number
    print("Error: " + description + ": " + location)

    raise GenonetsError(ErrorCodes.ALPHABET_TYPE_MISMATCH, location)
def build_data_dicts(in_file_path, tau, alphabet_type):
    """
    Parse the input file into the data structures describing the genotype
    sets.

    Only rows with a score greater than or equal to `tau` are used. The
    delta value of a genotype set is taken from the first usable row in
    which that genotype set appears.

    :param in_file_path: Path to the tab-delimited input file.
    :param tau: Lower bound on the score; rows below it are ignored.
    :param alphabet_type: Alphabet type against which every used genotype
                          is verified.
    :return: Tuple `(data_dict, delta_dict, genotype_to_set_dict,
             genotype_length, genotype_sets)` where `data_dict` maps
             genotype set name -> {genotype: score}, `delta_dict` maps
             genotype set name -> delta, `genotype_to_set_dict` is the
             result of `InReader.build_genotype_to_set_dict`,
             `genotype_length` is the length of the first used genotype,
             and `genotype_sets` lists genotype set names in the order in
             which they were read.
    :raises GenonetsError: With `INCONSISTENT_HEADER`, `MISSING_VALUE`,
                           `BAD_SCORE_FORMAT`, `BAD_DELTA_FORMAT` or
                           `NO_USABLE_SCORES`; errors raised by
                           `InReader.verify_genotype` are propagated.
    """
    # Data structures to be returned
    data_dict = {}
    delta_dict = {}
    genotype_sets = []  # Genotype set names in the order read from file

    # Unique genotypes across all genotype sets, in first-seen order. The
    # companion set keeps the membership test O(1) per row.
    genotypes = []
    seen_genotypes = set()

    # Genotype length; determined from the first usable row
    genotype_length = 0

    def fail(error_code, detail=None):
        # Print the error description (plus detail, if any) and raise the
        # corresponding GenonetsError.
        message = "Error: " + ErrorCodes.getErrDescription(error_code)

        if detail is None:
            print(message)
            raise GenonetsError(error_code)

        print(message + ": " + detail)
        raise GenonetsError(error_code, detail)

    # Get handle to the input file and a DictReader for the file
    reader, in_file = InReader.dict_reader_for_file(in_file_path)

    def current_line():
        # Line number the reader has reached, as a string
        return str(int(reader.line_num))

    # A single `finally` guarantees that the file is closed on every exit
    # path, including unexpected exceptions raised while iterating.
    try:
        # Check if all the required column headers are available
        if not InReader.req_hdrs_are_present(reader.fieldnames):
            fail(ErrorCodes.INCONSISTENT_HEADER)

        for row in reader:
            # Check for missing values in this row
            if any(value in (None, "") for value in row.values()):
                fail(ErrorCodes.MISSING_VALUE, "Line No. " + current_line())

            # Get the fitness score
            try:
                score = float(row["Score"])
            except (KeyError, TypeError, ValueError):
                fail(ErrorCodes.BAD_SCORE_FORMAT,
                     "Line No. " + current_line())

            # Skip rows below the threshold. Written as `not >=` so that a
            # NaN score is skipped as well.
            if not score >= tau:
                continue

            set_name = row["Genotypeset"]

            # First usable row of this genotype set
            if set_name not in data_dict:
                data_dict[set_name] = {}
                genotype_sets.append(set_name)

                # Get the delta value
                try:
                    delta = float(row["Delta"])
                except (KeyError, TypeError, ValueError):
                    fail(ErrorCodes.BAD_DELTA_FORMAT,
                         "Line No. " + current_line())

                delta_dict[set_name] = delta

            # Get the genotype sequence
            genotype = row["Genotype"]

            # The first usable genotype fixes the length for the entire
            # dataset
            if genotype_length == 0:
                genotype_length = len(genotype)

            InReader.verify_genotype(genotype, genotype_length,
                                     alphabet_type, current_line())

            # Add genotype as key and score as value to the current
            # genotype set
            data_dict[set_name][genotype] = score

            # Record the genotype if no genotype set has contained it yet
            if genotype not in seen_genotypes:
                seen_genotypes.add(genotype)
                genotypes.append(genotype)
    finally:
        in_file.close()

    # If no genotypes were found with score >= tau,
    if not data_dict:
        fail(ErrorCodes.NO_USABLE_SCORES, "Tau=" + str(tau))

    # Dictionary: Key=Sequence, Value=[Genotype sets]. Reverse dictionary
    # that is used in functions like evolvability.
    genotype_to_set_dict = InReader.build_genotype_to_set_dict(genotypes,
                                                               data_dict)

    return (data_dict, delta_dict, genotype_to_set_dict,
            genotype_length, genotype_sets)