Example #1
	def _add_arguments(self, parser):
		Command._add_arguments(self, parser)

		parser.add_argument("data_path", metavar="DATA",
							help="File containing the data matrix in TDM format")

		parser.add_argument("-N", "--samplings", dest="num_samplings", type=int, default=10000, metavar="NUMBER",
							help="Number of samplings to compute the FM bias pvalue")

		parser.add_argument("-e", "--estimator", dest="estimator", metavar="ESTIMATOR",
							choices=["mean", "median"], default="mean",
							help="Test estimator for computation.")
		
		parser.add_argument("--gt", "--gene-threshold", dest="mut_gene_threshold", type=int, default=2, metavar="THRESHOLD",
							help="Minimum number of mutations per gene to compute the FM bias")

		parser.add_argument("--pt", "--pathway-threshold", dest="mut_pathway_threshold", type=int, default=10, metavar="THRESHOLD",
							help="Minimum number of mutations per pathway to compute the FM bias")

		parser.add_argument("-s", "--slices", dest="slices", metavar="SLICES",
							help="Slices to process separated by commas")

		parser.add_argument("-m", "--mapping", dest="mapping", metavar="PATH",
							help="File with mappings between genes and pathways to be analysed")

		parser.add_argument("-f", "--filter", dest="filter", metavar="PATH",
							help="File containing the features to be filtered. By default labels are includes,"
								 " labels preceded with - are excludes.")

		parser.add_argument("--save-data", dest="save_data", default=False, action="store_true",
							help="The input data matrix will be saved")

		parser.add_argument("--save-analysis", dest="save_analysis", default=False, action="store_true",
							help="The analysis results will be saved")
Example #2
	def run(self):
		Command.run(self)

		# Load data

		self.log.info("Loading data ...")

		#TODO: Allow to specify the name of the column to load from data files: --data-column=PVALUE && /file.tsv,column=PVALUE
		#TODO: Allow TDM format

		row_names, col_names, data, method = self.load_data(self.args.data_paths, self.args.method)

		self.log.debug("  {0} rows, {1} columns to combine with method '{2}'".format(
							len(row_names), len(col_names), method or "unknown"))

		if method is None:
			self.log.error("Method of combination not defined. Use -m to define it.")
			exit(-1)

		method = create_method(method)

		if self.args.save_data:
			self.log.info("Saving data matrix ...")
			self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
						   row_names, col_names, data, suffix="data")

		self.log.info("Combining data using method '{0}' ...".format(method.name))

		combined_results = method.combine(np.ma.masked_invalid(data))

		self.log.info("Saving combined results ...")
		self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
						 row_names, method.combination_columns, combined_results.T,
						 params=[("slices", ",".join(col_names)), ("method", method.name)],
						 valid_row=lambda row: sum([1 if np.isnan(v) else 0 for v in row]) == 0)
Example #3
def process_command_output(queue, session):
    while True:
        #print("Inside Process Command")
        if queue.empty():
            break
        command = queue.get()
        print(command)
        command = command.replace('\n', '')
        execute = list(bashlex.split(command))
        if ';' in command:
            p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
            test = datetime.now()
            try:
                start_time = datetime.now()
                outs = p.communicate(timeout=60)
                diff = (datetime.now() - start_time).total_seconds()
                if isinstance(outs[0], bytes):
                    outs = outs[0].decode()
                #print(outs)
                value = Command(command.replace("'", "''"), len(command),
                                round(diff, 3),
                                bytes(outs.replace("'", "''"), 'utf8'))
                session.add(value)
            except subprocess.TimeoutExpired:
                p.kill()
                outs = p.communicate()
                diff = (datetime.now() - start_time).total_seconds()
                if isinstance(outs[0], bytes):
                    outs = outs[0].decode()
                #print(outs)
                value = Command(command.replace("'", "''"), len(command), 0,
                                bytes(outs.replace("'", "''"), 'utf8'))
                session.add(value)
            except subprocess.CalledProcessError:
                pass
        else:
            output = ""
            try:
                start_time = datetime.now()
                output = subprocess.check_output(
                    execute, timeout=60)  #check for timeout
                diff = (datetime.now() - start_time).total_seconds()
                if isinstance(output, bytes):
                    output = output.decode()
                #print(output)
                value = Command(command.replace("'", "''"), len(command),
                                round(diff, 3),
                                bytes(output.replace("'", "''"), 'utf8'))
                session.add(value)
            except subprocess.TimeoutExpired:
                #print(output)
                if isinstance(output, bytes):
                    output = output.decode()
                value = Command(command.replace("'", "''"), len(command), 0,
                                bytes(output.replace("'", "''"), 'utf8'))
                session.add(value)
            except subprocess.CalledProcessError:
                pass
        session.commit()
Example #4
	def _check_args(self):
		Command._check_args(self)

		if self.args.analysis_name is None:
			names = [os.path.splitext(os.path.basename(path))[0] for path in self.args.data_paths]
			prefix = os.path.commonprefix(names)
			if prefix.endswith("-"):
				prefix = prefix[:-1]
			if len(prefix) == 0:
				prefix = "oncodrivefm"
			self.args.analysis_name = prefix
Example #5
def process_command_output(queue):
    # TODO: run the command and put its data in the db
    while (not queue.empty()):
        command = queue.get()
        # create an object of the table
        c = Command(command, len(command), 0, 'fetching results...')
        # check whether the same command exists in the table; if it does, do not add the command to the database again
        command_db = session.query(Command).filter_by(
            command_string=command).first()
        if (command_db is None):
            session.add(c)
            session.commit()
        # create a temp file that does not exist already
        file = open('test_' + str(os.getpid()) + '.sh', 'w')
        file.write(command)
        file.close()

        start = time.time()
        p = subprocess.Popen(['sh', 'test_' + str(os.getpid()) + '.sh'],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        out = ''

        # poll the subprocess created until the returncode is not None or until the time is over 60 seconds
        while (p.returncode == None and time.time() - start < 60):
            p.poll()
            #read from stdout and store it in out
            out = out + p.stdout.readline()
        t = time.time() - start

        # if return code of the process is still None kill the process
        if (p.returncode == None):
            p.kill()

        # if time taken to run the command is over 60 seconds then assign the time taken as 0
        if (t > 60):
            t = 0.0

        # create an object of the table
        c = Command(command, len(command), math.ceil(t), out)
        # check whether the same command exists in the table; if it does, do not add the command to the database again
        command_db = session.query(Command).filter_by(
            command_string=command).first()
        if (command_db is None):
            session.add(c)
            session.commit()
        else:
            if (command_db.output == 'fetching results...'):
                command_db.output = out
                command_db.duration = t
                session.commit()
        #remove the temporary file that was created
        os.remove('test_' + str(os.getpid()) + '.sh')
Example #6
	def _add_arguments(self, parser):
		Command._add_arguments(self, parser)

		parser.add_argument("data_paths", metavar="DATA", nargs="+",
							help="Files with the results to be combined")

		parser.add_argument("-m", dest="method", metavar="NAME",
							choices=method_names(),
							help="The NAME of the method to combine values")

		parser.add_argument("--save-data", dest="save_data", default=False, action="store_true",
							help="The input data matrix will be saved")
Example #7
	def _check_args(self):
		Command._check_args(self)

		if self.args.analysis_name is None:
			self.args.analysis_name, ext = os.path.splitext(os.path.basename(self.args.data_path))

		if self.args.num_samplings < 1:
			self._error("Number of samplings out of range [2, ..)")

		if self.args.mut_threshold < 1:
			self._error("Minimum number of mutations out of range [1, ..)")

		if self.args.filter is not None:
			if not os.path.exists(self.args.filter):
				self._error("Filter file not found: {0}".format(self.args.filter))
Example #8
def process_command_output(queue):
    # TODO: execute the command and put its data in the db

    q = queue

    # Initialise list that will hold the database entry objects
    put_results_database = []
    while not q.empty():
        work = q.get(True)
        # execute is the command string to be executed
        execute = work
        command_string = execute
        length = len(execute)
        try:
            # Timing the process of executing each command
            # (time.clock() was removed in Python 3.8; perf_counter() measures wall-clock time)
            tic = time.perf_counter()
            process = subprocess.run(execute, shell=True, timeout=60, stdout=subprocess.PIPE)
            toc = time.perf_counter()
            # Storing the metadata in each column of the defined database
            output = process.stdout
            duration = toc - tic
        # Exception raised when the command takes longer than 1 minute to finish
        except subprocess.TimeoutExpired as e:
            print('long running or not finished scenario')
            duration = 0
            output = e.stdout

        # Appending the metadata object to the list
        result_entry = Command(command_string, length, duration, output)
        put_results_database.append(result_entry)

    # Putting results in database
    session.add_all(put_results_database)
    session.commit()
Example #9
def process_command_output(queue):
    # TODO: run the command and put its data in the db
    command = queue.get()
    start_time = time.time()
    try:
        proc = Popen(command,
                     shell=True,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                     preexec_fn=os.setsid)
        longer_exec_command = False
        #For longer running commands
        while proc.poll() is None:
            exec_time = time.time() - start_time
            if exec_time > 60:
                #kill the command execution
                os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
                longer_exec_command = True
        stdout, stderr = proc.communicate()

        duration = 0 if longer_exec_command else time.time() - start_time
        length = len(command)
        output = stdout if not stderr else stderr

        cmd_metadata = Command(command, length, duration, output)
        dbsession = Session()
        dbsession.add(cmd_metadata)
        dbsession.commit()
    except Exception as e:
        raise Exception("DB insertion failed...")
Example #10
def process_command_output(queue):
    # TODO: run the command and put its data in the db
    command = queue.get()
    a = datetime.datetime.now()
    output = b''
    totalTime = 0
    try:
        p = subprocess.Popen(command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        try:
            p.wait(60)
            for line in p.stdout.readlines():
                output = output + line
            totalTime = (datetime.datetime.now() - a).total_seconds()
        except Exception:
            p.kill()
            logging.error("Command :" + command +
                          "Not Executed Successfully due to Timeout")
            output = b''

        cmd = Command(command, len(command), totalTime, output)
        session.add(cmd)
        session.commit()
    except Exception as e:
        logging.error("Exception in process_command_output:")
Example #11
def process_shell_command(command_name):
    logger = get_task_logger(__name__)
    start_time = time.time()
    logger.info('Command currently being executed is {}'.format(command_name))
    try:
        process = sb.Popen(command_name,
                           shell=True,
                           stdin=sb.PIPE,
                           stdout=sb.PIPE,
                           stderr=sb.PIPE,
                           preexec_fn=os.setsid)
        process_Terminated = False
        # The code below handles tasks that take longer than usual (> 1 min, to be precise).
        # Because shell=True is passed to Popen, calling process.kill() may kill the shell itself
        # but not its child process (i.e. the command itself).
        # We therefore assign a session id to the shell, making it the leader of all the processes in the group,
        # so that sending the SIGTERM signal to the group leader (the shell) propagates to all its children (the commands).
        # A standalone sketch of this process-group termination pattern follows this example.
        while process.poll() is None:
            elapsed_time = time.time() - start_time
            if elapsed_time > 60:
                logger.info(
                    'Total time elapsed while executing task is {}'.format(
                        elapsed_time))
                process_Terminated = True
                os.killpg(os.getpgid(process.pid), signal.SIGTERM)

        pipe_output = process.communicate()
        command_output = pipe_output[0]
        command_error = pipe_output[1]
        logger.info('{} generated  {}'.format(command_name, command_output))
        duration = (time.time() - start_time)
        '''
		If the output value is empty and 
		the error string is non empty, the executed command errored out
		'''
        if not command_output and command_error:
            command_output = command_error
        '''
		If the process was terminated forcefully,set the output to reflect that
		Also set the duration to 0 to signify that it was terminated abruptly
		'''

        if process_Terminated:
            duration = 0
            logger.debug('The process was terminated due to long running time')
            command_output = 'Process was terminated'
        length = len(command_name)

        new_command = Command(command_name, length, duration,
                              sqlite.Binary(command_output))

        session = Session()
        session.add(new_command)
        session.commit()

    except Exception as e:
        logger.error(
            'Error generated while executing [{}], generated error is {}'.
            format(command_name, str(e)))
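The comment block in the example above describes the process-group termination trick in prose; the standalone sketch below isolates that pattern. It is only an illustrative sketch, not part of any of the examples here: the helper name run_with_group_timeout and the 60-second default are assumptions, and start_new_session=True is used as the modern equivalent of preexec_fn=os.setsid (POSIX only, Python 3).

import os
import signal
import subprocess
import time


def run_with_group_timeout(command, timeout=60):
    """Illustrative sketch: run a shell command and kill its whole process group on timeout.

    Returns (stdout_bytes, duration_seconds); duration is reported as 0 when the command was killed.
    """
    start = time.time()
    # The shell becomes the leader of a new process group (start_new_session=True is the
    # modern equivalent of preexec_fn=os.setsid), so signalling the group also reaches the
    # command the shell spawned; proc.kill() alone might only hit the shell.
    proc = subprocess.Popen(command, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            start_new_session=True)
    try:
        stdout, _ = proc.communicate(timeout=timeout)
        return stdout, time.time() - start
    except subprocess.TimeoutExpired:
        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
        stdout, _ = proc.communicate()
        return stdout, 0

For instance, run_with_group_timeout('sleep 120') would return after roughly 60 seconds with a duration of 0, mirroring how the examples above record timed-out commands.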
Example #12
	def _check_args(self):
		Command._check_args(self)

		if self.args.analysis_name is None:
			self.args.analysis_name, ext = os.path.splitext(os.path.basename(self.args.data_path))

		if self.args.num_samplings < 1:
			self._error("Number of samplings out of range [2, ..)")

		if self.args.mut_gene_threshold < 1:
			self._error("Minimum number of mutations per gene out of range [1, ..)")

		if self.args.mut_pathway_threshold < 1:
			self._error("Minimum number of mutations per pathway out of range [1, ..)")

		if self.args.mapping is not None and not os.path.isfile(self.args.mapping):
			self._error("Pathways mapping file not found: {0}".format(self.args.mapping))
Example #13
def put_to_db(data):
    """
    This function takes the data and puts it into the database
        @param data: dictionary containing the command data
    """
    insertion_list = list()
    for value in data.values():
        insertion_list.append(Command(value[1], value[2], value[4], value[3]))
    session.add_all(insertion_list)
    session.commit()
Example #14
def push_database(comm, length, dur, out):
    # print("This result will be cached in the DataBase")
    # print("Command: "+str(comm))
    # print("Lenght of Command:" + str(length))
    # print("Duration of Command Execution: " + str(dur))
    # print("Output of each Command: "+str(out))
    # print("*********************************************************************")
    row = Command(command_string=comm, length=length, duration=dur, output=out)
    session.add(row)
    session.commit()
    session.close()
Example #15
def process_command_output(queue):
    # TODO: run the command and put its data in the db
    """1.Get each command from the queue and run it in its own shell
       2.Clock the runtine of each command and convert it to nearest second
       3.Check if the command is already there in the db, if not then add it
       to the database."""

    while not queue.empty():
        command = queue.get()
        output_list = []

        ts = time.time()
        output, err = run_command(command)
        obj = json.dumps({'output':output.strip(),'error':err})
        te = time.time()

        elapsed_time = te - ts
        if elapsed_time > 60:
            elapsed_time = 0
        else:
            elapsed_time = math.ceil(elapsed_time)
        
        #Checking for duplicate commands in db
        try:
            cmd_db = session.query(exists().where(Command.command_string==command)).scalar()

        except IntegrityError as e:
            cmd_db = False

        
        #Add it to db if command not present in db
        if not cmd_db:
            try:
                cmd = Command(command,len(command), elapsed_time, obj)
                session.add(cmd)
                session.commit()

            except IntegrityError as e:
                session.rollback()
                print "Integrity Error:" + str(e)

            except SQLAlchemyError as e:
                session.rollback()
                print "Could Not insert the data" + str(e) 
Example #16
def process_command_output(queue):
    # TODO: run the command and put its data in the db

    while not queue.empty():

        try:
            file = "/outputs.txt"
            path = os.getcwd() + file
            fp = open(path, 'r+')
            cmd = queue.get()
            command = RunCommand(cmd)

            # Lists to store the data read from the output file; eventually we read from these lists and put the output in the DB
            subCmdOutput = list()
            cmdOutput = list()

            for line in fp:
                subCmdOutput.append(line)

            fp.close()
            cmdOutput.append(subCmdOutput)

            # Set timeout to 1 minute and terminate the child process
            time_taken = command.run(timeout=60)

            string = ""
            for i in cmdOutput:
                string = string.join(i)

            cmd_output = Command(cmd, len(cmd), round(time_taken, 4), string)
            session.add(cmd_output)

        except Exception as ie:
            print ie.message

    # Commit session if everything goes well, else rollback
    try:
        session.commit()
    except Exception as ie:
        session.rollback()
Example #17
def get_command_output():
    """
    Returns as json the command details that have been processed
    ---
    tags: [commands]
    responses:
      200:
        description: Commands returned OK
      400:
        description: Commands not found
    """
    logger.info('inside command query method')
    commands = Session().query(Command).all()
    command_list = list()
    for command in commands:
        command_list.append(
            Command(command.command_string, command.length, command.duration,
                    str(command.output)))
    logger.info(
        'The list of  commands that have been processed are  {}'.format(
            command_list))
    # TODO: format the query result
    return jsonify(eqtls=[cmd_obj.serialize() for cmd_obj in command_list])
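The route above assumes a Command model with a serialize() method, which none of these examples show. The sketch below is purely an assumption: the column names mirror the constructors used throughout (command_string, length, duration, output), but the real model may differ.

from sqlalchemy import Column, Float, Integer, LargeBinary, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Command(Base):
    """Hypothetical model; the actual table definition is not part of these examples."""
    __tablename__ = 'commands'

    id = Column(Integer, primary_key=True)
    command_string = Column(String)
    length = Column(Integer)
    duration = Column(Float)
    output = Column(LargeBinary)

    def __init__(self, command_string, length, duration, output):
        self.command_string = command_string
        self.length = length
        self.duration = duration
        self.output = output

    def serialize(self):
        # Return a JSON-friendly dict for jsonify(); bytes output is decoded defensively.
        output = self.output
        if isinstance(output, bytes):
            output = output.decode('utf-8', 'replace')
        return {
            'command_string': self.command_string,
            'length': self.length,
            'duration': self.duration,
            'output': output,
        }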
Example #18
	def run(self):
		Command.run(self)

		# Load data

		self.log.info("Loading data ...")
		self.log.debug("  > {0}".format(self.args.data_path))

		#TODO: Support loading plain matrices: /file.tsv#name=SIFT

		self.matrix = tdm.load_matrix(self.args.data_path)

		self.log.debug("  {0} rows, {1} columns and {2} slices".format(
		self.matrix.num_rows, self.matrix.num_cols, self.matrix.num_slices))

		# Load filter

		self.filter = LabelFilter()
		if self.args.filter is not None:
			self.log.info("Loading filter ...")
			self.log.debug("  > {0}".format(self.args.filter))

			self.filter.load(self.args.filter)

			self.log.debug("  {0} includes, {1} excludes".format(
				self.filter.include_count, self.filter.exclude_count))

		# Load mapping

		if self.args.mapping is not None:
			self.log.info("Loading mapping ...")
			self.log.debug("  > {0}".format(self.args.mapping))

			self.mapping = self.load_mapping(self.matrix, self.args.mapping, self.filter)

			self.log.debug("  {0} features".format(self.mapping.num_groups))

			method_name = "{0}-{1}".format(self.args.estimator, ZscoreTest.NAME)
		else: # One to one mapping
			map = {}
			for row_name in self.matrix.row_names:
				if self.filter.valid(row_name):
					map[row_name] = (row_name,)
			self.mapping = MatrixMapping(self.matrix, map)
			method_name = "{0}-{1}".format(self.args.estimator, EmpiricalTest.NAME)

		# Get selected slice indices

		if self.args.slices is not None:
			slices = []
			for name in self.args.slices.split(","):
				name = name.strip()
				if name not in self.matrix.slice_name_index:
					self.log.warn("Skipping slice not found: {0}".format(name))
					continue
				slices += [self.matrix.slice_name_index[name]]
		else:
			slices = range(self.matrix.num_slices)

		col_names = [self.matrix.slice_names[i] for i in slices]

		if self.args.save_data:
			for i in slices:
				slice_name = self.matrix.slice_names[i]
				self.log.info("Saving {0} data matrix ...".format(slice_name))
				self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
								 self.matrix.row_names, self.matrix.col_names, self.matrix.data[i],
								 suffix="data-{0}".format(slice_name))

		# Run the analysis

		self.log.info("Running the analysing using '{0}' ...".format(method_name))

		analysis = OncodriveFmAnalysis(
			"oncodrivefm.compute",
			num_samplings = self.args.num_samplings,
			mut_threshold = self.args.mut_threshold,
			num_cores=self.args.num_cores)

		results = analysis.compute(self.matrix, self.mapping, method_name, slices)

		method = create_method(method_name)

		self.log.info("Saving results ...")

		#TODO: Have an option to save in TDM instead of splited
		self.save_splited_results(
			self.args.output_path, self.args.analysis_name, self.args.output_format,
			self.matrix, self.mapping, method, results, slices)
Example #19
def process_command_output(queue):

    # TODO: run the command and put its data in the db

    while not queue.empty():
        command = queue.get()

        # Process commands as you get

        # out_text = ''
        out_byte = None
        output = subprocess.Popen(command,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        try:
            (stdout, stderr) = output.communicate(timeout=60)
            out_byte = stdout
            # out_text = stdout.decode('us-ascii').rstrip('\n')
        except subprocess.TimeoutExpired:
            # out_text = 'NOT FINISHED'
            pass

        output.stdout.close()
        output.stderr.close()

        # output.terminate()

        outputtime = subprocess.Popen('time ' + command,
                                      shell=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        isexpired = 0
        try:
            (stdout, stderr) = outputtime.communicate(timeout=60)
            stderr_text = stderr.decode('us-ascii').rstrip('\n')
        except subprocess.TimeoutExpired:
            isexpired = 1
            pass

        # outputtime.terminate()
        # 0.00user 0.00system 0:00.00elapsed 0%CPU
        timeinseconds = 00
        timeinmilliseconds = 00
        # checks milli seconds and seconds
        if not isexpired:
            if stderr_text[22:24].isnumeric():
                if stderr_text[25:27].isnumeric():
                    timeinmilliseconds = int(stderr_text[25:27])
                    timeinseconds = int(stderr_text[22:24])
                    if timeinmilliseconds > 0:
                        # Rounding to ceil if milliseconds is > 0
                        timeinseconds = timeinseconds + 1
                    timeinseconds = (1 if timeinseconds < 1 else timeinseconds)

        # save processed commands to Database
        # session=Session()

        cmd = Command(command, len(command), timeinseconds, out_byte)
        session.add(cmd)
        session.commit()
Example #20
File: dca.py Project: 50wu/gpdb
 def __init__(self, name, ctxt=LOCAL, remoteHost=None):
     self.cmdStr="/opt/dca/bin/dca_gpdb_stopped"
     Command.__init__(self, name, self.cmdStr, ctxt, remoteHost)
Example #21
def get_valid_commands(queue, fi, file_data):
    # TODO: efficiently evaluate commands

    # File processing for getting COMMAND LIST and VALID COMMANDS
    script_dir = os.path.dirname(__file__)
    path = os.path.join(script_dir, fi)
    try:
        with open(path, "r") as fh:
            CommandLine = fh.readlines()
    except IOError as err:
        CommandLine = file_data

    commandList = []
    validList = []
    # Initialise the section flags so that lines before the first header are ignored safely
    commandFlag = False
    validFlag = False
    for line in CommandLine:
        if '[COMMAND LIST]' in line:
            commandFlag = True
            validFlag = False
        elif '[VALID COMMANDS]' in line:
            commandFlag = False
            validFlag = True

        if '[COMMAND LIST]' not in line and '[VALID COMMANDS]' not in line and line.rstrip() != '':
            if commandFlag:
                commandList.append(line.rstrip())
            elif validFlag:
                validList.append(line.rstrip())

    #Get all the valid commands
    validCommandsFromInput = set(commandList) & set(validList)

    for command in validCommandsFromInput:
        try:

            try:
                start = time.time()
                commandResult = subprocess.check_output(command, shell=True, stderr=STDOUT, timeout=60 )
                CommandTimeTaken = (time.time() - start)
            except subprocess.TimeoutExpired as err:
                print('Handling TimeoutExpired: ', err, ' for command: ', command)
                CommandTimeTaken = 0  # (if the command takes > 1 minute to complete, mark a 0 which will represent "Not finished")
                commandResult = b''

            CommandString = command
            commandLength = len(command)
            print(commandResult)
            print(CommandString)
            print(commandLength)
            print(CommandTimeTaken)

            # Check if the command is already in the table
            flag = session.query(Command).filter_by(command_string=CommandString).first()

            if flag:
                print('Command : "', command, '" is alredy in commands table')
            else:
                ed_commands = Command(CommandString, commandLength, CommandTimeTaken, commandResult)
                session.add(ed_commands)
                session.commit()
        except subprocess.CalledProcessError as err:
            print('Handling CalledProcessError: ', err, ' for command: ', command)
            continue

    return 200
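The parsing at the top of get_valid_commands implies an input file split into a [COMMAND LIST] section and a [VALID COMMANDS] section, with only their intersection being executed. The snippet below writes a hypothetical file in that inferred layout; the file name commands.txt and the commands themselves are made up for illustration.

# Hypothetical input, inferred from the section markers parsed above.
SAMPLE_INPUT = """\
[COMMAND LIST]
ls -l
pwd
sleep 120
[VALID COMMANDS]
ls -l
pwd
"""

with open("commands.txt", "w") as fh:
    fh.write(SAMPLE_INPUT)

With such a file, validCommandsFromInput would be the set {'ls -l', 'pwd'}; 'sleep 120' appears only in the command list and is therefore skipped.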
Example #22
	def __init__(self):
		Command.__init__(self, prog="oncodrivefm", desc="Compute the FM bias for genes and pathways")
Example #23
 def __init__(self, name, ctxt=LOCAL, remoteHost=None):
     self.cmdStr = "/opt/dca/bin/dca_gpdb_stopped"
     Command.__init__(self, name, self.cmdStr, ctxt, remoteHost)
Example #24
	def __init__(self):
		Command.__init__(self, prog="oncodrivefm-compute", desc="Compute the FM bias")
Example #25
	def run(self):
		Command.run(self)

		# Load filter

		self.filter = LabelFilter()
		if self.args.filter is not None:
			self.log.info("Loading filter ...")
			self.log.debug("  > {0}".format(self.args.filter))

			self.filter.load(self.args.filter)

			self.log.debug("  {0} includes, {1} excludes".format(
				self.filter.include_count, self.filter.exclude_count))

		# Load data

		self.log.info("Loading data ...")
		self.log.debug("  > {0}".format(self.args.data_path))

		#TODO: Support loading plain matrices: /file.tsv#slice=SIFT

		self.matrix = tdm.load_matrix(self.args.data_path)

		self.log.debug("  {0} rows, {1} columns and {2} slices".format(
			self.matrix.num_rows, self.matrix.num_cols, self.matrix.num_slices))

		# Get selected slice indices

		if self.args.slices is not None:
			slices = []
			for name in self.args.slices.split(","):
				name = name.strip()
				if name not in self.matrix.slice_name_index:
					raise Exception("Slice not found: {0}".format(name))
				slices += [self.matrix.slice_name_index[name]]
		else:
			slices = range(self.matrix.num_slices)

		col_names = [self.matrix.slice_names[i] for i in slices]

		if self.args.save_data:
			for i in slices:
				slice_name = self.matrix.slice_names[i]
				self.log.info("Saving {0} data matrix ...".format(slice_name))
				self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
							  self.matrix.row_names, self.matrix.col_names, self.matrix.data[i],
							  suffix="data-{0}".format(slice_name))

		# GENES ---------------------------------------

		# One to one mapping for genes

		map = {}
		for row_name in self.matrix.row_names:
			if self.filter.valid(row_name):
				map[row_name] = (row_name,)
		genes_mapping = MatrixMapping(self.matrix, map)
		genes_method_name = "{0}-{1}".format(self.args.estimator, EmpiricalTest.NAME)

		# Analysis for genes

		self.log.info("Analysing genes with '{0}' ...".format(genes_method_name))

		analysis = OncodriveFmAnalysis(
			"oncodrivefm.genes",
			num_samplings = self.args.num_samplings,
			mut_threshold = self.args.mut_gene_threshold,
			num_cores=self.args.num_cores)

		results = analysis.compute(self.matrix, genes_mapping, genes_method_name, slices)

		method = create_method(genes_method_name)

		if self.args.save_analysis:
			self.log.info("Saving genes analysis results ...")
			self.save_splited_results(
				self.args.output_path, self.args.analysis_name, self.args.output_format,
				self.matrix, genes_mapping,
				method, results, slices, suffix="genes")

		# Combination for genes

		self.log.info("Combining analysis results ...")

		combined_results = method.combine(np.ma.masked_invalid(results.T))

		self.log.info("Saving genes combined results ...")
		self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
							 genes_mapping.group_names, method.combination_columns, combined_results.T,
							 params=[("slices", ",".join(col_names)), ("method", method.name)], suffix="genes",
							 valid_row=lambda row: sum([1 if np.isnan(v) else 0 for v in row]) == 0)

		if self.args.mapping is None:
			return

		# PATHWAYS ---------------------------------------

		# Load pathways mapping

		self.log.info("Loading pathways mapping ...")
		self.log.debug("  > {0}".format(self.args.mapping))

		pathways_mapping = self.load_mapping(self.matrix, self.args.mapping)

		self.log.debug("  {0} pathways".format(pathways_mapping.num_groups))

		pathways_method_name = "{0}-{1}".format(self.args.estimator, ZscoreTest.NAME)

		# Analysis for pathways

		self.log.info("Analysing pathways with '{0}' ...".format(pathways_method_name))

		analysis = OncodriveFmAnalysis(
			"oncodrivefm.pathways",
			num_samplings = self.args.num_samplings,
			mut_threshold = self.args.mut_pathway_threshold,
			num_cores=self.args.num_cores)

		results = analysis.compute(self.matrix, pathways_mapping, pathways_method_name, slices)

		method = create_method(pathways_method_name)

		if self.args.save_analysis:
			self.log.info("Saving pathways analysis results ...")
			self.save_splited_results(
				self.args.output_path, self.args.analysis_name, self.args.output_format,
				self.matrix, pathways_mapping,
				method, results, slices, suffix="pathways")

		# Combination for pathways

		self.log.info("Combining analysis results ...")

		combined_results = method.combine(np.ma.masked_invalid(results.T))

		self.log.info("Saving pathways combined results ...")
		self.save_matrix(self.args.output_path, self.args.analysis_name, self.args.output_format,
							pathways_mapping.group_names, method.combination_columns, combined_results.T,
							params=[("slices", ",".join(col_names)), ("method", method.name)], suffix="pathways",
							valid_row=lambda row: sum([1 if np.isnan(v) else 0 for v in row]) == 0)
Example #26
	def __init__(self):
		Command.__init__(self, prog="oncodrivefm-combine", desc="Combine FM bias results")