Beispiel #1
0
 def pD(self):
     """Try the label matcher, then the integer matcher, then continue with ``_pL``.

     ``print_error(1)`` is called when neither matcher succeeds.
     """
     # ``or`` short-circuits, so the integer matcher only runs when the label
     # matcher did not succeed -- the same order as an if/elif chain.
     if self._p_label() or self._p_integer():
         self._pL()
     else:
         print_error(1)
Beispiel #2
0
def make_random_ids(used_ids_database, sample_ids):
    """Assign every sample ID a random ID that has never been issued before.

    Previously issued IDs are read from the ``unique_ids`` table of the SQLite
    database at ``used_ids_database`` (the table is created when missing), and
    the IDs generated by this call are appended to that table.

    Args:
        used_ids_database: Path of the SQLite database recording issued IDs.
        sample_ids: Iterable of the original sample IDs.

    Returns:
        A dict mapping each original sample ID to its new random ID.

    Exits the program with ERROR_RANDOM_ID_ITERATIONS when a candidate still
    collides after MAX_RANDOM_ID_ITERATIONS retries.
    """
    conn = sqlite3.connect(used_ids_database)
    try:
        cursor = conn.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS unique_ids (id integer)')
        # The set of IDs that have already been used, loaded from the database
        used_ids = {
            used_id
            for (used_id,) in cursor.execute('SELECT * from unique_ids')
        }
        # The newly generated IDs by the call to this function
        new_ids = []
        # Maps each original ID to its new randomised ID
        result = {}
        for old_sample in sample_ids:
            # Make sure the newly generated ID has not been seen before
            new_id = make_one_random_id()
            iter_count = 0
            while new_id in used_ids:
                if iter_count >= MAX_RANDOM_ID_ITERATIONS:
                    print_error("Could not make a new random ID, iteration count exceeded")
                    exit(ERROR_RANDOM_ID_ITERATIONS)
                new_id = make_one_random_id()
                iter_count += 1
            result[old_sample] = new_id
            # Record this new ID in the set of previously used IDs so we don't
            # use it again
            used_ids.add(new_id)
            new_ids.append(new_id)
        # Write the newly created IDs out to the database in one batched,
        # parameterised statement
        cursor.executemany('INSERT into unique_ids (id) VALUES (?)',
                           [(new_id,) for new_id in new_ids])
        conn.commit()
    finally:
        # Release the database handle on every path, including exit()
        conn.close()
    return result
Beispiel #3
0
 def split_sample_id(self):
     """Split the filename into its first field and everything after it.

     Returns:
         A ``(sample_id, remainder)`` tuple: the first field reported by
         ``get_fields`` and the filename with that many leading characters
         removed.

     Exits with BAD_FILENAME when ``get_fields`` yields no fields.
     """
     name = self.get_filename()
     parts = self.get_fields(name)
     if len(parts) == 0:
         print_error("Cannot find sample ID in filename: {}".format(self.absolute_path))
         exit(BAD_FILENAME)
     sample_id = parts[0]
     return sample_id, name[len(sample_id):]
Beispiel #4
0
 def anonymise(self, randomised_ids):
     """Replace every sample's ``Sample_ID`` with its randomised counterpart.

     Args:
         randomised_ids: Mapping from original sample ID to randomised ID.

     Exits with ERROR_RANDOMISE_ID when a sample's ID has no entry in
     ``randomised_ids``; otherwise finishes by calling
     ``update_sample_ids``.
     """
     id_key = "Sample_ID"
     for record in self.samples:
         original_id = record[id_key]
         try:
             record[id_key] = randomised_ids[original_id]
         except KeyError:
             message = "Cannot anonymise sample {}".format(original_id)
             print_error(message)
             exit(ERROR_RANDOMISE_ID)
     self.update_sample_ids()
 def anonymise(self, randomised_ids):
     """Swap each sample's ``Sample_ID`` for the value in ``randomised_ids``.

     Args:
         randomised_ids: Mapping keyed by the original sample ID.

     A sample whose ID is missing from the mapping is reported and the
     program exits with ERROR_RANDOMISE_ID. After all samples are rewritten
     ``update_sample_ids`` is called.
     """
     for entry in self.samples:
         previous = entry['Sample_ID']
         try:
             entry['Sample_ID'] = randomised_ids[previous]
         except KeyError:
             print_error("Cannot anonymise sample {}".format(previous))
             exit(ERROR_RANDOMISE_ID)
     # Done last, once every record carries its new ID
     self.update_sample_ids()
Beispiel #6
0
 def split_sample_id(self):
     """Return the filename's first field and the rest of the filename.

     The first field comes from ``get_fields``; the remainder is the filename
     with as many leading characters dropped as that field is long. Exits
     with BAD_FILENAME when there are no fields.
     """
     filename = self.get_filename()
     fields = self.get_fields(filename)
     if len(fields) < 1:
         message = "Cannot find sample ID in filename: {}".format(
             self.absolute_path)
         print_error(message)
         exit(BAD_FILENAME)
     head = fields[0]
     tail = filename[len(head):]
     return head, tail
Beispiel #7
0
def create_app_dir(application):
    """Create the output directory for an application and return its path.

    The directory is ``<application id>/<request id>``, both read from
    ``application.fields``. When it cannot be created (``os.makedirs`` raises
    ``OSError``, which includes the directory already existing) the error is
    reported and the program exits with ERROR_MAKE_DIR.
    """
    app_fields = application.fields
    target_dir = os.path.join(app_fields['application id'],
                              app_fields['request id'])
    try:
        os.makedirs(target_dir)
    except OSError as make_error:
        print_error("failed to make directory {}".format(target_dir))
        print(make_error, file=sys.stderr)
        exit(ERROR_MAKE_DIR)
    return target_dir
Beispiel #8
0
 def _pR(self):
     """Handle what follows an identifier: a comma or the closing semicolon.

     After a comma, parsing continues with ``_pL``. After a semicolon the
     next symbol must be ``"$"``, in which case ``is_matched`` is set;
     otherwise ``print_error(4)`` is called. ``print_error(3)`` is called
     when neither a comma nor a semicolon is matched.
     """
     if self._p_comma():
         self._pL()
         return
     if not self._p_semicolon():
         print_error(3)
         return
     if self.next_symbol != "$":
         print_error(4)
         return
     self.is_matched = True
Beispiel #9
0
def anonymise_files(filenames: list[str],
                    randomised_ids: dict,
                    application_dir: str,
                    filename_type: Data_filename,
                    file_editor=None):
    """Produce anonymised output files for every recognised input file.

    Each path in ``filenames`` is wrapped with ``filename_type``; paths it
    rejects with ``FileTypeException`` are skipped. For the rest, the sample
    ID in the filename is replaced by its entry in ``randomised_ids`` and
    fields 1 and 2 of the filename (the batch ID, e.g. ``AGRF_024``) are
    replaced by a random five-character token. Within one call the same
    original batch ID always maps to the same token, and distinct batch IDs
    never share a token.

    Args:
        filenames: Paths of the input files.
        randomised_ids: Mapping from original sample ID to randomised ID.
        application_dir: Directory in which the output files are created.
        filename_type: Callable that builds a filename handler from a path.
        file_editor: Optional callable ``(old_id, new_id, old_path,
            new_path)`` that writes the anonymised file. When omitted the
            output is a symlink to the input file.

    Returns:
        The list of output file paths, in the order the inputs were handled.

    Exits with ERROR_RANDOMISE_ID when a file's sample ID is missing or has
    no entry in ``randomised_ids``.
    """
    output_files = []
    randomised_batch_ids = {}
    batch_alphabet = string.ascii_lowercase + string.digits

    for file_path in filenames:
        try:
            file_handler = filename_type(file_path)
        except FileTypeException:
            # skip this file
            continue

        # sample id
        old_id = file_handler.get_sample_id()
        if old_id is None or old_id not in randomised_ids:
            print_error("Cannot randomise this file: {}".format(file_path))
            exit(ERROR_RANDOMISE_ID)
        new_id = str(randomised_ids[old_id])
        file_handler.replace_sample_id(new_id)

        # Replace batch id AGRF_024 with XXXXX
        # Example filename: 010108101_AGRF_024_HG3JKBCXX_CGTACTAG_L001_R1.fastq.gz
        fields = file_handler.get_fields(file_handler.get_filename())

        # Example old_batch_id: AGRF_024
        old_batch_id = fields[1] + '_' + fields[2]

        # Key by old_batch_id because we want to make sure the same batch id
        # gets the same new randomised batch id
        if old_batch_id not in randomised_batch_ids:
            # Redraw on collision so two different batches can never end up
            # with the same replacement token
            taken = set(randomised_batch_ids.values())
            new_batch_id = None
            while new_batch_id is None or new_batch_id in taken:
                new_batch_id = ''.join(
                    random.choice(batch_alphabet) for _ in range(5))
            randomised_batch_ids[old_batch_id] = new_batch_id

        # Replace AGRF_024 (field 1 and 2) with XXXXX
        file_handler.replace_field(randomised_batch_ids[old_batch_id], 1, 2)

        # file_handler has updated filename (attribute of this object) at this point
        new_filename = file_handler.get_filename()
        new_path = os.path.join(application_dir, new_filename)
        output_files.append(new_path)
        if file_editor is not None:
            file_editor(old_id, new_id, file_path, new_path)
            logging.info("Anonymised {} to {}".format(file_path, new_path))
        else:
            os.symlink(file_path, new_path)
            logging.info("Linked {} to {}".format(new_path, file_path))

    return output_files
Beispiel #10
0
def md5_files(md5_command, filenames):
    """Run the checksum command on each file and save its output beside it.

    For every entry of ``filenames`` the command ``md5_command`` (split on
    whitespace) is invoked with the filename appended, and its standard
    output is written to ``<filename>.md5``.

    Args:
        md5_command: The checksum command line, e.g. a program name plus flags.
        filenames: Iterable of paths to checksum.

    Exits with ERROR_MD5 when launching the command raises ``OSError``.
    """
    for target in filenames:
        checksum_path = target + ".md5"
        logging.info("{} {} > {}".format(md5_command, target, checksum_path))
        with open(checksum_path, "w") as checksum_file:
            try:
                argv = [*md5_command.split(), target]
                call(argv, stdout=checksum_file)
            except OSError as launch_error:
                print_error(launch_error)
                exit(ERROR_MD5)
    def __init__(self, filename):
        """Load a JSON document and validate it against the packaged schema.

        Args:
            filename: An open, readable file object containing the JSON
                document (it is passed straight to ``json.load``).

        On success the parsed document is stored in ``self.fields``. The
        program exits with ERROR_JSON_SCHEMA_DEFINE when the schema resource
        cannot be located, ERROR_JSON_SCHEMA_OPEN when the schema file cannot
        be opened, and ERROR_JSON_SCHEMA_INVALID when validation fails.
        """
        fields = json.load(filename)

        # Locate the JSON schema shipped with the program
        try:
            json_schema_filename = resource_filename(
                PROGRAM_NAME, os.path.join('data', JSON_SCHEMA))
        except Exception:
            print_error("JSON schema file not defined, program not installed correctly")
            exit(ERROR_JSON_SCHEMA_DEFINE)

        # ``with`` closes the schema file only if it was actually opened, so a
        # failed open() reaches the error handler below instead of tripping
        # over an unbound file variable during cleanup
        try:
            with open(json_schema_filename) as json_schema_file:
                json_schema = json.load(json_schema_file)
        except OSError as e:
            print_error("Cannot open JSON schema file: {}".format(json_schema_filename))
            print(e, file=sys.stderr)
            exit(ERROR_JSON_SCHEMA_OPEN)

        try:
            validate(fields, json_schema)
        except ValidationError as e:
            print_error("JSON input file is not valid: {}".format(filename))
            print(e, file=sys.stderr)
            exit(ERROR_JSON_SCHEMA_INVALID)

        # We only reach here if everything succeeded
        self.fields = fields
Beispiel #12
0
    def split_sample_id(self):
        """Separate the leading sample ID from the rest of the filename.

        Only the first field is split off. For a filename such as
        ``SAMPLEID_AGRF_111_HHMN7BCXX_TAAGGCGA_L001_R1.fastq.gz`` the
        remainder still starts with ``_AGRF_111``.

        Returns:
            A ``(first_field, remainder)`` tuple.

        Exits with BAD_FILENAME when ``get_fields`` yields no fields.
        """
        full_name = self.get_filename()
        name_fields = self.get_fields(full_name)
        if not len(name_fields) > 0:
            print_error("Cannot find sample ID in filename: {}".format(
                self.absolute_path))
            exit(BAD_FILENAME)
        leading_field = name_fields[0]
        cut = len(leading_field)
        return leading_field, full_name[cut:]
 def allowed_data_types(self):
     '''Look up which data combinations this application may receive.

     The application fields are converted with application_to_request and
     looked up in REQUEST_COMBINATIONS. The program exits with
     ERROR_INVALID_APPLICATION when there is no entry, and with
     ERROR_INCOMPATIBLE_REQUEST when the requested identifiability is not
     among the allowed combinations. Otherwise the allowed combinations
     are returned.'''
     request = application_to_request(self.fields)
     try:
         permitted = REQUEST_COMBINATIONS[request]
     except KeyError:
         print_error("Application does not have a valid interpretation")
         print(json.dumps(self.fields, indent=4), file=sys.stderr)
         exit(ERROR_INVALID_APPLICATION)
     # check that what was requested is compatible with what is available
     wanted = self.fields['identifiability']
     if wanted not in permitted:
         print_error("Requested identifiability {} is not compatible with allowed results {}".format(wanted, permitted))
         exit(ERROR_INCOMPATIBLE_REQUEST)
     return permitted
Beispiel #14
0
    def pD(self):
        """Match ``label``/``integer`` followed by a comma-separated id list.

        The accepted symbol sequence is: ``label`` or ``integer``, an
        identifier, any number of ``, identifier`` pairs, ``;`` and finally
        ``$``. Every comma must be followed by an identifier.

        Sets ``self.is_matched`` to whether the whole sequence matched and
        calls ``print_error(5)`` when it did not. ``self.next_symbol`` is
        advanced through ``self.scanner`` as symbols are consumed.
        """
        is_error = True

        if self.next_symbol in ["label", "integer"]:
            self.next_symbol = self.scanner.get_next_symbol()
            if self._is_id():
                self.next_symbol = self.scanner.get_next_symbol()
                list_ok = True
                while self.next_symbol == ',':
                    self.next_symbol = self.scanner.get_next_symbol()
                    if not self._is_id():
                        # A dangling or doubled comma is a syntax error
                        list_ok = False
                        break
                    self.next_symbol = self.scanner.get_next_symbol()

                if list_ok and self.next_symbol == ';':
                    self.next_symbol = self.scanner.get_next_symbol()
                    if self.next_symbol == "$":
                        is_error = False
        if is_error:
            print_error(5)

        self.is_matched = not is_error
Beispiel #15
0
def store_strings(code):
    """Replace double-quoted string literals in the code with placeholders.

    The pieces of ``code`` are joined into one text. Each ``"`` character
    toggles between "inside a literal" and "outside". Every literal
    (including its quotes) is collected and replaced in the text by
    ``string<N>``, where N is its position in the collected list. A newline
    reached while inside a literal is reported through ``print_error`` with
    the current line number.

    Args:
        code: Iterable of strings making up the source text.

    Returns:
        A tuple ``(lines, literals)``: the rewritten text split on newlines,
        and the list of extracted literals.
    """
    source = "".join(code)
    open_positions = []
    close_positions = []
    inside_literal = False
    current_line = 1
    for offset, symbol in enumerate(source):
        if symbol == "\"":
            # A quote closes the open literal, or opens a new one
            if inside_literal:
                close_positions.append(offset)
            else:
                open_positions.append(offset)
            inside_literal = not inside_literal
        elif symbol == "\n":
            if inside_literal:
                print_error(
                    current_line,
                    "Unterminated string. Strings cannot be longer than one line."
                )
            else:
                current_line += 1

    # Cut out each literal, quotes included
    literals = [
        source[begin:close_positions[number] + 1]
        for number, begin in enumerate(open_positions)
    ]

    # Swap the literals for numbered placeholders
    for number, literal in enumerate(literals):
        source = source.replace(literal, "string{}".format(number))

    return source.split("\n"), literals
def make_random_ids(used_ids_database, sample_ids):
    """Map each sample ID to a fresh random ID and record the IDs as used.

    IDs issued earlier are loaded from the ``unique_ids`` table of the
    SQLite database at ``used_ids_database`` (created when missing). Each
    new ID is redrawn until it differs from all used IDs, and the IDs issued
    by this call are inserted into the table before returning.

    Args:
        used_ids_database: Path of the SQLite database of issued IDs.
        sample_ids: Iterable of the original sample IDs.

    Returns:
        A dict from original sample ID to its new random ID.

    Exits the program with ERROR_RANDOM_ID_ITERATIONS when a candidate still
    collides after MAX_RANDOM_ID_ITERATIONS retries.
    """
    conn = sqlite3.connect(used_ids_database)
    try:
        cursor = conn.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS unique_ids (id integer)')
        # Grab all the IDs that have already been used from the database
        used_ids = set()
        for (next_used_id, ) in cursor.execute('SELECT * from unique_ids'):
            used_ids.add(next_used_id)
        # The newly generated IDs by the call to this function
        new_ids = []
        # Maps each original ID to its new randomised ID
        result = {}
        for old_sample in sample_ids:
            # Make sure the newly generated ID has not been seen before
            new_id = make_one_random_id()
            iter_count = 0
            while new_id in used_ids:
                if iter_count >= MAX_RANDOM_ID_ITERATIONS:
                    print_error(
                        "Could not make a new random ID, iteration count exceeded")
                    exit(ERROR_RANDOM_ID_ITERATIONS)
                new_id = make_one_random_id()
                iter_count += 1
            result[old_sample] = new_id
            # Record this new ID in the set of previously used IDs so we don't
            # use it again
            used_ids.add(new_id)
            new_ids.append(new_id)
        # Write the newly created IDs out to the database with a single
        # parameterised, batched INSERT
        cursor.executemany('INSERT into unique_ids (id) VALUES (?)',
                           [(new_id, ) for new_id in new_ids])
        conn.commit()
    finally:
        # Always release the database handle, even when exit() is raised
        conn.close()
    return result
Beispiel #17
0
def function_dict_maker(code):
    """
    Turn parsed code into a dict of functions (and error if the functions don't make sense)

    A statement whose first element ends in ":" opens a function named by
    that element without its colons. A statement ``["end", name, ...]``
    closes it, and the statements strictly between the two become the
    function body. The last element of each statement is passed to
    ``print_error`` as the location of a problem.

    Args:
        code: Sequence of statements, each an indexable sequence.

    Returns:
        A dict mapping function name to the list of its body statements.
    """
    function_dict = dict()
    new_function_name = ""
    # find function names
    for index, statement in enumerate(code):
        if statement[0][-1] == ":":
            new_function_name = statement[0].strip(':')
            # check if new function name matches one of the previous ones
            if new_function_name in function_dict:
                print_error(
                    statement[-1],
                    "Function {} already defined.".format(new_function_name))
            function_start_index = index
        if statement[0] == "end":
            if len(statement) == 2:
                # "end" with no function name
                print_error(statement[-1], "Unresolved end statement.")
            elif statement[1] == new_function_name:
                # found a function body, collect everything between the
                # opening statement and this "end"
                function_dict[new_function_name] = list(
                    code[function_start_index + 1:index])
            else:
                # report at the statement's own location, like the other
                # errors here, rather than at its position in ``code``
                print_error(
                    statement[-1], "Unrecognised function name '{}'.".format(
                        statement[1]))

    # if main not in function names, abort
    if "main" not in function_dict:
        print_error(1, "No main function found.")

    return function_dict
### neuralNet.py ###
#
#
#

from sklearn.neural_network import MLPRegressor
import data, error

# Create neural network object: two hidden layers of 128 units each, at most
# 1000 training iterations, progress messages enabled (verbose=True)
regr = MLPRegressor(max_iter=1000, hidden_layer_sizes=(128, 128), verbose=True)

# Train model on the training inputs/targets exposed by the local data module
regr.fit(data.train_X, data.train_Y)

# Predict targets for the test inputs
prediction = regr.predict(data.test_X)

# Use error.py module to print mean squared error and average percent error
error.print_error(prediction, data.test_Y)
Beispiel #19
0
def main():
    """Run the data-release pipeline for one data application.

    Steps, in order: parse the command line and set up logging; parse the
    application JSON file; create the output directory; determine the
    allowed data types; collect and consent-filter the sample metadata;
    select the requested files; then either anonymise the files and
    metadata or link the files unchanged; finally write MD5 checksums for
    the output files.

    Exits with ERROR_BAD_ALLOWED_DATA when the allowed data types contain
    neither 'Anonymised' nor 'Re-identifiable'.
    """
    args = parse_args()
    init_log(args.log)
    with open(args.app) as app_file:
        # parse and validate the requested data application JSON file
        application = Application(app_file)
        logging.info("Input data application parsed: {}".format(args.app))
        # Create output directory for the results
        application_dir = create_app_dir(application)
        # check what data types are allowed for this application
        allowed_data_types = application.allowed_data_types()
        logging.info("Allowed data types: {}".format(
            ' '.join(allowed_data_types)))
        if len(allowed_data_types) > 0:
            # Get all the sample metadata for all requested cohorts
            requested_cohorts = application.cohorts()
            metadata = Metadata(args.data, requested_cohorts)
            logging.info("Metadata collected for requested cohorts: {}".format(
                ' '.join(requested_cohorts)))
            metadata_sample_ids = sorted(metadata.get_sample_ids())
            logging.info("Metadata for sample IDs: {}".format(
                ' '.join(metadata_sample_ids)))
            # Filter the sample metadata based on patient consent
            # NOTE(review): the warning logged right after this call says
            # consent is not handled yet -- confirm what filter_consent does
            metadata.filter_consent(args.consent, allowed_data_types)
            logging.warning("Consent not handled yet. FIXME")
            # Find all the file paths for requested file types for each
            # consented sample
            requested_file_types = application.file_types()
            logging.info("Requested file types: {}".format(
                ' '.join(requested_file_types)))
            fastqs, bams, bais, vcfs = get_files(args.data,
                                                 requested_file_types,
                                                 metadata)
            logging.info("VCF files selected:\n{}".format('\n'.join(vcfs)))
            logging.info("BAM files selected:\n{}".format('\n'.join(bams)))
            logging.info("BAI files selected:\n{}".format('\n'.join(bais)))
            logging.info("FASTQ files selected:\n{}".format('\n'.join(fastqs)))
            output_files = []
            # 'Anonymised' takes precedence when both kinds are allowed
            if 'Anonymised' in allowed_data_types:
                # generate random IDs for all output samples
                # NOTE(review): reads metadata.sample_ids here but
                # metadata.get_sample_ids() above -- confirm both agree
                randomised_ids = make_random_ids(args.usedids,
                                                 metadata.sample_ids)
                metadata.anonymise(randomised_ids)
                metadata.write(args.metaout)
                logging.info("Anonymised metadata written to: {}".format(
                    args.metaout))
                # VCFs and BAMs are rewritten by their editor callbacks
                new_vcfs = anonymise_files(vcfs, randomised_ids,
                                           application_dir, VCF_filename,
                                           vcf_edit)
                new_bams = anonymise_files(bams, randomised_ids,
                                           application_dir, BAM_filename,
                                           bam_edit)
                # BAIs and FASTQs are just sym-linked to output with randomised name
                new_bais = anonymise_files(bais, randomised_ids,
                                           application_dir, BAI_filename)
                new_fastqs = anonymise_files(fastqs, randomised_ids,
                                             application_dir, FASTQ_filename)
                output_files.extend(new_vcfs + new_bams + new_bais +
                                    new_fastqs)
                logging.info("Output files are anonymised")
            elif 'Re-identifiable' in allowed_data_types:
                # Files keep their names; metadata is written unmodified
                new_links = link_files(application_dir,
                                       vcfs + bams + bais + fastqs)
                output_files.extend(new_links)
                logging.info(
                    "Files linked in directory: {}".format(application_dir))
                metadata.write(args.metaout)
                logging.info("Output files are re-identifiable")
            else:
                print_error(
                    "Allowed data is neither anonymised nor re-identifiable")
                exit(ERROR_BAD_ALLOWED_DATA)
            logging.info("Generating MD5 checksums on output files")
            md5_files(args.md5, output_files)
        else:
            logging.warning("No data available for this application")
Beispiel #20
0
def assemble_instr(statement):
    """
    Assemble one statement into ``[trytes, count]``.

    A statement whose first element starts with "!" is passed through as
    ``[[first_element], 0]``. Otherwise the first element is looked up as an
    instruction mnemonic, the matching ``handle_instr`` function is called
    with the whole statement, and the result is returned together with its
    length. An unknown mnemonic is reported through ``print_error`` using
    the statement's last element as the location.
    """
    mnemonics = (
        "READ", "WRITE", "LOAD", "SAVE", "DSKSIZE", "PRINT", "SHOW", "TELL",
        "DSET", "DGET", "PEEK", "FILL", "PUSH", "POP", "WHERE", "SET", "SWAP",
        "CCMP", "CCAR", "COVF", "PRI", "CHK", "FLIP", "INC", "DEC", "ADD",
        "MUL", "DIV", "SHL", "SHR", "CMP", "AND", "OR", "XOR", "ABS", "NOT",
        "NOOP", "JPZ", "JPN", "JPP", "JP", "JPS", "PJP", "THD", "CYCL",
        "TIME", "SETINT", "HALT", "WAIT", "CALL", "STRWRT", "STRPNT",
    )
    # Every mnemonic is handled by the handle_instr function of the same name
    instruction_dict = {
        mnemonic: getattr(handle_instr, mnemonic)
        for mnemonic in mnemonics
    }

    head = statement[0]
    if head[0] == "!":
        return [[head], 0]

    instruction = instruction_dict.get(head)
    if not instruction:
        print_error(statement[-1],
                    "Unrecognised instruction '{}'.".format(head))
    output_trytes = instruction(statement)
    return [output_trytes, len(output_trytes)]
import sys
import multinom_data, error
from sklearn import linear_model
"""
Predict and print error
"""
# Create linear regression object
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the installed version still accepts it.
# copy_X=False allows the estimator to overwrite the training inputs.
regr = linear_model.LinearRegression(fit_intercept=True,
                                     normalize=True,
                                     copy_X=False)

# Train the model
regr.fit(multinom_data.train_X, multinom_data.train_Y)

# Predict
prediction = regr.predict(multinom_data.test_X)

# Use error.py module to print mean squared error and average percent error
error.print_error(prediction, multinom_data.test_Y)
Beispiel #22
0
 def _pL(self):
     """Consume an identifier and continue with ``_pR``.

     ``print_error(2)`` is called when the current symbol is not an
     identifier according to ``_is_id``.
     """
     if not self._is_id():
         print_error(2)
         return
     self._p_id()
     self._pR()