def pD(self):
    if self._p_label():
        self._pL()
    elif self._p_integer():
        self._pL()
    else:
        print_error(1)
def make_random_ids(used_ids_database, sample_ids):
    conn = sqlite3.connect(used_ids_database)
    cursor = conn.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS unique_ids (id integer)')
    # The set of IDs that have already been used
    used_ids = set()
    # Grab all the IDs from the database
    for (next_used_id,) in cursor.execute('SELECT * from unique_ids'):
        used_ids.add(next_used_id)
    # The newly generated IDs by the call to this function
    new_ids = []
    # A mapping from each original ID to its new randomised ID
    result = {}
    for old_sample in sample_ids:
        # Make sure the newly generated ID has not been seen before
        new_id = make_one_random_id()
        iter_count = 0
        while new_id in used_ids:
            if iter_count >= MAX_RANDOM_ID_ITERATIONS:
                print_error("Could not make a new random ID, iteration count exceeded")
                exit(ERROR_RANDOM_ID_ITERATIONS)
            new_id = make_one_random_id()
            iter_count += 1
        result[old_sample] = new_id
        # Record this new ID in the set of previously used IDs so we don't
        # use it again
        used_ids.add(new_id)
        new_ids.append(new_id)
    # Write the newly created IDs out to the database, using a parameterised
    # query rather than string formatting
    # XXX should be able to do this as a single INSERT statement
    for new_id in new_ids:
        cursor.execute('INSERT INTO unique_ids (id) VALUES (?)', (new_id,))
    conn.commit()
    return result
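# make_one_random_id is called above but not defined in this section; a
# minimal sketch, assuming IDs are random integers (the table stores an
# integer column, but the 9-digit width here is an assumption) and that
# `random` is imported, as anonymise_files below already requires:
def make_one_random_id():
    # Hypothetical helper: draw a random 9-digit integer ID
    return random.randint(10 ** 8, 10 ** 9 - 1)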
def split_sample_id(self):
    filename = self.get_filename()
    fields = self.get_fields(filename)
    if len(fields) > 0:
        prefix_len = len(fields[0])
        return fields[0], filename[prefix_len:]
    else:
        print_error("Cannot find sample ID in filename: {}".format(
            self.absolute_path))
        exit(BAD_FILENAME)
def anonymise(self, randomised_ids):
    for sample in self.samples:
        old_id = sample["Sample_ID"]
        try:
            sample["Sample_ID"] = randomised_ids[old_id]
        except KeyError:
            print_error("Cannot anonymise sample {}".format(old_id))
            exit(ERROR_RANDOMISE_ID)
    self.update_sample_ids()
def create_app_dir(application):
    path = os.path.join(application.fields['application id'],
                        application.fields['request id'])
    try:
        os.makedirs(path)
    except OSError as e:
        print_error("failed to make directory {}".format(path))
        print(e, file=sys.stderr)
        exit(ERROR_MAKE_DIR)
    return path
def _pR(self):
    if self._p_comma():
        self._pL()
    elif self._p_semicolon():
        if self.next_symbol != "$":
            print_error(4)
        else:
            self.is_matched = True
    else:
        print_error(3)
def anonymise_files(filenames: list[str], randomised_ids: dict,
                    application_dir: str, filename_type: Data_filename,
                    file_editor=None):
    output_files = []
    randomised_batch_ids = {}
    for file_path in filenames:
        try:
            file_handler = filename_type(file_path)
        except FileTypeException:
            # Skip this file
            continue
        else:
            # Replace the sample ID with its randomised counterpart
            old_id = file_handler.get_sample_id()
            if old_id is None or old_id not in randomised_ids:
                print_error("Cannot randomise this file: {}".format(file_path))
                exit(ERROR_RANDOMISE_ID)
            else:
                new_id = str(randomised_ids[old_id])
                file_handler.replace_sample_id(new_id)
                # Replace a batch ID such as AGRF_024 with XXXXX
                # Example filename: 010108101_AGRF_024_HG3JKBCXX_CGTACTAG_L001_R1.fastq.gz
                fields = file_handler.get_fields(file_handler.get_filename())
                # Example old_batch_id: AGRF_024
                old_batch_id = fields[1] + '_' + fields[2]
                # Key by old_batch_id so that the same batch ID always gets
                # the same new randomised batch ID
                if old_batch_id not in randomised_batch_ids:
                    new_batch_id = ''.join(
                        random.choice(string.ascii_lowercase + string.digits)
                        for x in range(5))
                    randomised_batch_ids[old_batch_id] = new_batch_id
                # Replace the batch ID (fields 1 and 2) with the random one
                file_handler.replace_field(randomised_batch_ids[old_batch_id], 1, 2)
                # file_handler now holds the updated filename
                new_filename = file_handler.get_filename()
                new_path = os.path.join(application_dir, new_filename)
                output_files.append(new_path)
                if file_editor is not None:
                    file_editor(old_id, new_id, file_path, new_path)
                    logging.info("Anonymised {} to {}".format(file_path, new_path))
                else:
                    os.symlink(file_path, new_path)
                    logging.info("Linked {} to {}".format(new_path, file_path))
    return output_files
def md5_files(md5_command, filenames):
    for filename in filenames:
        output_filename = filename + ".md5"
        logging.info("{} {} > {}".format(md5_command, filename,
                                         output_filename))
        with open(output_filename, "w") as out_file:
            try:
                command = md5_command.split() + [filename]
                call(command, stdout=out_file)
            except OSError as e:
                print_error(e)
                exit(ERROR_MD5)
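# A usage sketch for md5_files, assuming the coreutils `md5sum` tool (the
# command string is an assumption; in main() below it comes from args.md5):
#
#     md5_files("md5sum", ["sample.vcf", "sample.bam"])
#
# This writes sample.vcf.md5 and sample.bam.md5, one checksum file per input.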
def __init__(self, filename):
    fields = json.load(filename)
    # Validate the input JSON file against the schema. This exits the program
    # if the validation fails, otherwise we store a dictionary representing
    # the JSON file
    try:
        json_schema_filename = resource_filename(
            PROGRAM_NAME, os.path.join('data', JSON_SCHEMA))
    except Exception:
        print_error("JSON schema file not defined, program not installed correctly")
        exit(ERROR_JSON_SCHEMA_DEFINE)
    try:
        with open(json_schema_filename) as json_schema_file:
            json_schema = json.load(json_schema_file)
    except OSError as e:
        print_error("Cannot open JSON schema file: {}".format(json_schema_filename))
        print(e, file=sys.stderr)
        exit(ERROR_JSON_SCHEMA_OPEN)
    try:
        validate(fields, json_schema)
    except ValidationError as e:
        print_error("JSON input file is not valid: {}".format(filename))
        print(e, file=sys.stderr)
        exit(ERROR_JSON_SCHEMA_INVALID)
    # We only reach here if everything succeeded
    self.fields = fields
def split_sample_id(self):
    filename = self.get_filename()
    fields = self.get_fields(filename)
    if len(fields) > 0:
        # SAMPLEID_AGRF_111_HHMN7BCXX_TAAGGCGA_L001_R1.fastq.gz
        # get rid of _AGRF_111
        # prefix_len = len(fields[0]) + 1 + len(fields[1]) + 1 + len(fields[2])
        prefix_len = len(fields[0])
        return fields[0], filename[prefix_len:]
    else:
        print_error("Cannot find sample ID in filename: {}".format(
            self.absolute_path))
        exit(BAD_FILENAME)
def allowed_data_types(self):
    '''Based on the input application, decide what data is available to the
    requestor.'''
    request = application_to_request(self.fields)
    try:
        allowed_combinations = REQUEST_COMBINATIONS[request]
    except KeyError:
        print_error("Application does not have a valid interpretation")
        print(json.dumps(self.fields, indent=4), file=sys.stderr)
        exit(ERROR_INVALID_APPLICATION)
    # Check that what was requested is compatible with what is available
    requested_identifiability = self.fields['identifiability']
    if requested_identifiability in allowed_combinations:
        return allowed_combinations
    else:
        print_error("Requested identifiability {} is not compatible with "
                    "allowed results {}".format(requested_identifiability,
                                                allowed_combinations))
        exit(ERROR_INCOMPATIBLE_REQUEST)
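# REQUEST_COMBINATIONS is not shown in this section; a sketch of the shape
# that allowed_data_types implies (the keys are assumptions; the values
# 'Anonymised' and 'Re-identifiable' are the levels main() checks for below):
#
# REQUEST_COMBINATIONS = {
#     <request>: ['Anonymised', 'Re-identifiable'],
#     ...
# }
#
# i.e. each request maps to the identifiability levels that may be released,
# and the application's 'identifiability' field must be one of them.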
def pD(self):
    is_error = True
    if self.next_symbol in ["label", "integer"]:
        self.next_symbol = self.scanner.get_next_symbol()
        if self._is_id():
            self.next_symbol = self.scanner.get_next_symbol()
            while self.next_symbol == ',':
                self.next_symbol = self.scanner.get_next_symbol()
                if self._is_id():
                    self.next_symbol = self.scanner.get_next_symbol()
            if self.next_symbol == ';':
                self.next_symbol = self.scanner.get_next_symbol()
                if self.next_symbol == "$":
                    is_error = False
    if is_error:
        print_error(5)
    self.is_matched = not is_error
def store_strings(code):
    code_as_string = "".join(code)
    stored_strings = []
    string_starts = []
    string_ends = []
    # Loop through the code string - when we see a " char, note its position
    in_string = False
    line_number = 1
    for pos, char in enumerate(code_as_string):
        if char == "\"":
            if in_string:
                in_string = False
                string_ends.append(pos)
            else:
                in_string = True
                string_starts.append(pos)
        if char == "\n":
            if in_string:
                print_error(
                    line_number,
                    "Unterminated string. Strings cannot be longer than one line."
                )
            else:
                line_number += 1
    # Now we know the locations of the strings, extract them (with their " chars)
    for i in range(len(string_starts)):
        stored_strings.append(
            code_as_string[string_starts[i]:string_ends[i] + 1])
    # And replace them in the code with placeholders
    for i in range(len(string_starts)):
        code_as_string = code_as_string.replace(stored_strings[i],
                                                "string{}".format(i))
    # Lastly, split the code into a list of strings again
    output_code = code_as_string.split("\n")
    return output_code, stored_strings
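# A usage sketch for store_strings. Note that the input lines are joined with
# "".join, so each line is assumed to carry its own trailing newline:
code = ['PRINT "hello"\n', 'PRINT "world"\n']
new_code, strings = store_strings(code)
# new_code -> ['PRINT string0', 'PRINT string1', '']
# strings  -> ['"hello"', '"world"']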
def function_dict_maker(code):
    """
    Turn parsed code into a dict of functions (and error if the functions
    don't make sense)
    """
    function_dict = dict()
    new_function_name = ""
    # Find function names
    for index, statement in enumerate(code):
        if statement[0][-1] == ":":
            new_function_name = statement[0].strip(':')
            # Check if the new function name matches one of the previous ones
            if new_function_name in function_dict:
                print_error(
                    statement[-1],
                    "Function {} already defined.".format(new_function_name))
            function_start_index = index
        if statement[0] == "end":
            if len(statement) == 2:
                print_error(statement[-1], "Unresolved end statement.")
            else:
                if statement[1] == new_function_name:
                    # Found a function body, collect it
                    function_end_index = index
                    new_function_body = []
                    for statement_index in range(function_start_index + 1,
                                                 function_end_index):
                        new_function_body.append(code[statement_index])
                    function_dict[new_function_name] = new_function_body
                else:
                    print_error(
                        statement[-1],
                        "Unrecognised function name '{}'.".format(
                            statement[1]))
    # If main is not among the function names, abort
    if "main" not in function_dict:
        print_error(1, "No main function found.")
    return function_dict
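# A usage sketch for function_dict_maker. The input format is an assumption
# inferred from the code: each statement is a list of tokens with its source
# line number appended as the final element.
parsed = [
    ["main:", 1],        # function header
    ["NOOP", 2],         # function body
    ["end", "main", 3],  # closing statement
]
funcs = function_dict_maker(parsed)
# funcs -> {"main": [["NOOP", 2]]}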
### neuralNet.py ###
#
#
from sklearn.neural_network import MLPRegressor

import data, error

# Create neural network object
regr = MLPRegressor(max_iter=1000, hidden_layer_sizes=(128, 128),
                    verbose=True)

# Train model
regr.fit(data.train_X, data.train_Y)

# Predict
prediction = regr.predict(data.test_X)

# Use error.py module to print mean squared error and average percent error
error.print_error(prediction, data.test_Y)
def main():
    args = parse_args()
    init_log(args.log)
    with open(args.app) as app_file:
        # Parse and validate the requested data application JSON file
        application = Application(app_file)
        logging.info("Input data application parsed: {}".format(args.app))
        # Create output directory for the results
        application_dir = create_app_dir(application)
        # Check what data types are allowed for this application
        allowed_data_types = application.allowed_data_types()
        logging.info("Allowed data types: {}".format(
            ' '.join(allowed_data_types)))
        if len(allowed_data_types) > 0:
            # Get all the sample metadata for all requested cohorts
            requested_cohorts = application.cohorts()
            metadata = Metadata(args.data, requested_cohorts)
            logging.info("Metadata collected for requested cohorts: {}".format(
                ' '.join(requested_cohorts)))
            metadata_sample_ids = sorted(metadata.get_sample_ids())
            logging.info("Metadata for sample IDs: {}".format(
                ' '.join(metadata_sample_ids)))
            # Filter the sample metadata based on patient consent
            metadata.filter_consent(args.consent, allowed_data_types)
            logging.warning("Consent not handled yet. FIXME")
            # Find all the file paths for requested file types for each
            # consented sample
            requested_file_types = application.file_types()
            logging.info("Requested file types: {}".format(
                ' '.join(requested_file_types)))
            fastqs, bams, bais, vcfs = get_files(args.data,
                                                 requested_file_types,
                                                 metadata)
            logging.info("VCF files selected:\n{}".format('\n'.join(vcfs)))
            logging.info("BAM files selected:\n{}".format('\n'.join(bams)))
            logging.info("BAI files selected:\n{}".format('\n'.join(bais)))
            logging.info("FASTQ files selected:\n{}".format('\n'.join(fastqs)))
            output_files = []
            if 'Anonymised' in allowed_data_types:
                # Generate random IDs for all output samples
                randomised_ids = make_random_ids(args.usedids,
                                                 metadata.sample_ids)
                metadata.anonymise(randomised_ids)
                metadata.write(args.metaout)
                logging.info("Anonymised metadata written to: {}".format(
                    args.metaout))
                new_vcfs = anonymise_files(vcfs, randomised_ids,
                                           application_dir, VCF_filename,
                                           vcf_edit)
                new_bams = anonymise_files(bams, randomised_ids,
                                           application_dir, BAM_filename,
                                           bam_edit)
                # BAIs and FASTQs are just sym-linked to the output with a
                # randomised name
                new_bais = anonymise_files(bais, randomised_ids,
                                           application_dir, BAI_filename)
                new_fastqs = anonymise_files(fastqs, randomised_ids,
                                             application_dir, FASTQ_filename)
                output_files.extend(new_vcfs + new_bams + new_bais +
                                    new_fastqs)
                logging.info("Output files are anonymised")
            elif 'Re-identifiable' in allowed_data_types:
                new_links = link_files(application_dir,
                                       vcfs + bams + bais + fastqs)
                output_files.extend(new_links)
                logging.info("Files linked in directory: {}".format(
                    application_dir))
                metadata.write(args.metaout)
                logging.info("Output files are re-identifiable")
            else:
                print_error("Allowed data is neither anonymised nor "
                            "re-identifiable")
                exit(ERROR_BAD_ALLOWED_DATA)
            logging.info("Generating MD5 checksums on output files")
            md5_files(args.md5, output_files)
        else:
            logging.warning("No data available for this application")
def assemble_instr(statement):
    """
    Transform a single statement into a list of Trytes, followed by the
    number of Trytes it assembled into.
    """
    instruction_dict = {
        "READ": handle_instr.READ, "WRITE": handle_instr.WRITE,
        "LOAD": handle_instr.LOAD, "SAVE": handle_instr.SAVE,
        "DSKSIZE": handle_instr.DSKSIZE, "PRINT": handle_instr.PRINT,
        "SHOW": handle_instr.SHOW, "TELL": handle_instr.TELL,
        "DSET": handle_instr.DSET, "DGET": handle_instr.DGET,
        "PEEK": handle_instr.PEEK, "FILL": handle_instr.FILL,
        "PUSH": handle_instr.PUSH, "POP": handle_instr.POP,
        "WHERE": handle_instr.WHERE, "SET": handle_instr.SET,
        "SWAP": handle_instr.SWAP, "CCMP": handle_instr.CCMP,
        "CCAR": handle_instr.CCAR, "COVF": handle_instr.COVF,
        "PRI": handle_instr.PRI, "CHK": handle_instr.CHK,
        "FLIP": handle_instr.FLIP, "INC": handle_instr.INC,
        "DEC": handle_instr.DEC, "ADD": handle_instr.ADD,
        "MUL": handle_instr.MUL, "DIV": handle_instr.DIV,
        "SHL": handle_instr.SHL, "SHR": handle_instr.SHR,
        "CMP": handle_instr.CMP, "AND": handle_instr.AND,
        "OR": handle_instr.OR, "XOR": handle_instr.XOR,
        "ABS": handle_instr.ABS, "NOT": handle_instr.NOT,
        "NOOP": handle_instr.NOOP, "JPZ": handle_instr.JPZ,
        "JPN": handle_instr.JPN, "JPP": handle_instr.JPP,
        "JP": handle_instr.JP, "JPS": handle_instr.JPS,
        "PJP": handle_instr.PJP, "THD": handle_instr.THD,
        "CYCL": handle_instr.CYCL, "TIME": handle_instr.TIME,
        "SETINT": handle_instr.SETINT, "HALT": handle_instr.HALT,
        "WAIT": handle_instr.WAIT, "CALL": handle_instr.CALL,
        "STRWRT": handle_instr.STRWRT, "STRPNT": handle_instr.STRPNT
    }
    if statement[0][0] == "!":
        # Jump labels assemble to themselves and occupy no Trytes
        return [[statement[0]], 0]
    else:
        instruction = instruction_dict.get(statement[0])
        if not instruction:
            print_error(statement[-1],
                        "Unrecognised instruction '{}'.".format(statement[0]))
        output_trytes = instruction(statement)
        return [output_trytes] + [len(output_trytes)]
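# A usage sketch for assemble_instr, under the same statement format as
# function_dict_maker above (tokens with a trailing line number). The Tryte
# values come from handle_instr, which is not shown in this section:
#
#     assemble_instr(["!loop", 7])  ->  [["!loop"], 0]   # labels take no space
#     assemble_instr(["NOOP", 8])   ->  [trytes, len(trytes)]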
import multinom_data, error
from sklearn import linear_model
"""
Predict and print error
"""

# Create linear regression object
regr = linear_model.LinearRegression(fit_intercept=True, normalize=True,
                                     copy_X=False)

# Train the model
regr.fit(multinom_data.train_X, multinom_data.train_Y)

# Predict
prediction = regr.predict(multinom_data.test_X)

# Use error.py module to print mean squared error and average percent error
error.print_error(prediction, multinom_data.test_Y)
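### error.py (sketch) ###
# The error module imported by the two scripts above is not shown in this
# section; a minimal sketch, assuming print_error reports mean squared error
# and average percent error (both formulas are assumptions inferred from the
# comments above):
import numpy as np

def print_error(prediction, truth):
    prediction = np.asarray(prediction, dtype=float)
    truth = np.asarray(truth, dtype=float)
    # Mean squared error over all test examples
    mse = np.mean((prediction - truth) ** 2)
    # Average absolute percent error, assuming truth values are non-zero
    avg_pct = np.mean(np.abs((prediction - truth) / truth)) * 100
    print("Mean squared error: {:.4f}".format(mse))
    print("Average percent error: {:.2f}%".format(avg_pct))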
def _pL(self):
    if self._is_id():
        self._p_id()
        self._pR()
    else:
        print_error(2)