def cross_test(sqlite_cmdline, codec):
    """Compare the reference query's output from an official binary.

    The query returned by ``test_query()`` is handed to ``call_program``
    together with ``sqlite_cmdline``; the two byte strings it returns are
    newline-normalized and then checked.

    :param sqlite_cmdline: passed unchanged to ``call_program``
    :param codec: not used in this function body
    :raises sqlite3.ProgrammingError: if the normalized output differs from
        the expected bytes, or if the normalized error stream is non-empty
    """
    query_bytes, expected_bytes = test_query()
    raw_output, raw_error = call_program(sqlite_cmdline, query_bytes)

    # We can't use os.linesep here since binaries may belong to different
    # platforms (Win32/MinGW vs. MSYS/Cygwin vs. WSL...)
    official_output = raw_output.replace(b"\r\n", b"\n")
    official_error = raw_error.replace(b"\r\n", b"\n")

    if official_output != expected_bytes:
        message = "expected bytes are wrong: official %s != expected %s" % (
            repr(official_output), repr(expected_bytes))
        raise sqlite3.ProgrammingError(message)

    if official_error:
        raise sqlite3.ProgrammingError(
            "did not expect errors from official binary")
def from_database_id(cls, record_id, db_cursor):
    """
    Build a SongChunk from the rows already stored for it in the database.

    The spectrogram itself is not read; only the stored spectrogram path,
    the "written" flag, the archipelagos and the chunk dimensions are copied
    onto the new instance.

    :param record_id: RecordID of the chunk to load
    :param db_cursor: SQLite3 cursor; it is used both to execute the queries
        and to fetch their results
    :return: a SongChunk whose specname, spec_in_memory, archipelagos, height
        and width come from the database
    :raises sqlite3.ProgrammingError: if no chunk has this RecordID, or if
        the number of archipelago rows differs from the stored NumACP
    """
    def first_row(query):
        # Fill the RecordID into the query, run it, return the first row.
        db_cursor.execute(query.format(record_id))
        return db_cursor.fetchone()

    loaded = SongChunk()

    # Path of the spectrogram; a missing row means the RecordID is unknown.
    spec_row = first_row("SELECT SpecPath FROM chunks where RecordID={}")
    if spec_row is None:
        raise sqlite3.ProgrammingError(
            "The RecordID ({}) of the chunk you tried to load wasn't"
            " found in the database".format(record_id))
    loaded.specname = spec_row[0]

    # Whether the spectrogram was actually written at that path.
    written_row = first_row(
        "SELECT SpecWritten FROM chunks where RecordID={}")
    loaded.spec_in_memory = bool(written_row[0])

    # Number of archipelagos the chunk is supposed to have.
    expected_count = first_row(
        "SELECT NumACP FROM chunks where RecordID={}")[0]
    if expected_count:
        db_cursor.execute(
            "SELECT ArchID FROM archs where ParentChunk={}".format(
                record_id))
        arch_rows = db_cursor.fetchall()
        found_count = len(arch_rows)
        if found_count != expected_count:
            raise sqlite3.ProgrammingError(
                "Did not find the expected number of archipelagos in the database. "
                "Expected: {}, Found: {}".format(expected_count, found_count))
        # All ArchIDs were fetched above, so the cursor can be reused by the
        # per-archipelago loader.
        for arch_row in arch_rows:
            loaded.archipelagos.append(
                DenseArchipelago.from_database_id(arch_row[0], db_cursor))
        loaded._archipelagos_initialized = True

    loaded.height, loaded.width = first_row(
        "SELECT Height, Width FROM chunks where RecordID={}")
    return loaded
def self_test(codec):
    """Run ``main`` on the reference query over byte and text streams.

    The query from ``test_query()`` is fed to ``main`` twice: first through
    ``io.BytesIO`` streams, then through ``io.StringIO`` streams. Each run
    must return a falsy value and leave the expected output in stdout.

    :param codec: object whose ``decode(bytes, 'surrogateescape')`` result is
        compared with the text-mode output
    :raises sqlite3.ProgrammingError: "byte I/O is broken" or
        "string I/O is broken", depending on which run failed
    """
    query_bytes, expected_bytes = test_query()

    # Pass 1: binary streams.
    stdin, stdout, stderr = io.BytesIO(query_bytes), io.BytesIO(), io.BytesIO()
    status = main(sys.argv[0], stdin=stdin, stdout=stdout, stderr=stderr)
    if status or stdout.getvalue() != expected_bytes:
        raise sqlite3.ProgrammingError("byte I/O is broken")

    # Pass 2: text streams.
    # NOTE(review): `ascii` is passed as a bare name; presumably the module
    # binds it to an encoding name, otherwise this is the builtin -- confirm.
    stdin, stdout, stderr = (io.StringIO(query_bytes.decode(ascii)),
                             io.StringIO(), io.StringIO())
    status = main(sys.argv[0], stdin=stdin, stdout=stdout, stderr=stderr)
    if status or stdout.getvalue() != codec.decode(expected_bytes,
                                                   'surrogateescape'):
        raise sqlite3.ProgrammingError("string I/O is broken")
def test_sqlite_close(self, mock_graph):
    """Closing a SQLITE store must not propagate a ProgrammingError."""
    # make sure this weird but harmless sqlite3 exception is
    # caught
    harmless_error = sqlite3.ProgrammingError("You made a wrong")
    mock_graph.return_value.close.side_effect = harmless_error
    TripleStore.connect("SQLITE", "", "").close()
def __init__(self):
    """Create the logger and the exception-class groups of the handler.

    Each ``_*_errors`` / ``_request_exceptions`` attribute is a list of
    exception classes. The classes are listed directly (no throw-away
    instances are created just to call ``type()`` on them) and each class
    appears once per list.
    """
    self.__logger = Logger()
    self._request_exceptions = [
        requests.ConnectionError,
        requests.HTTPError,
        requests.TooManyRedirects,
        requests.Timeout,
        requests.RequestException,
        requests.ConnectTimeout,
        requests.ReadTimeout,
    ]
    self._system_errors = [
        KeyError,
        AttributeError,
        IndexError,
        ZeroDivisionError,
        SystemError,
        ValueError,
        AssertionError,
    ]
    self._file_errors = [FileExistsError, FileNotFoundError]
    self._database_errors = [
        sqlite3.Error,
        sqlite3.DataError,
        sqlite3.ProgrammingError,
        sqlite3.DatabaseError,
        sqlite3.NotSupportedError,
        sqlite3.IntegrityError,
        sqlite3.InterfaceError,
        sqlite3.InternalError,
        sqlite3.OperationalError,
    ]
    self._speech_recognizer_errors = [
        sr.RequestError,
        sr.UnknownValueError,
        sr.WaitTimeoutError,
    ]
    self.__logger.info('ExceptionsHandler was successfully initialized.',
                       __name__)
def get_columns(cursor, table):
    """Return the column names of *table* as a list.

    The names are taken from the second field of every row produced by
    ``PRAGMA table_info``.

    :raises sqlite3.ProgrammingError: if the pragma yields no rows
    """
    pragma = 'PRAGMA table_info({0})'.format(table)
    cursor.execute(pragma)
    names = [info[1] for info in cursor]
    if names:
        return names
    raise sqlite3.ProgrammingError('no such table: {0}'.format(table))
def in_transaction(self):
    """Report the ``in_transaction`` state of the wrapped connection.

    Mirrors :any:`sqlite3.Connection.in_transaction`.

    :raises sqlite3.ProgrammingError: if ``self.connection`` is ``None``
    """
    if self.connection is None:
        raise sqlite3.ProgrammingError("Cannot operate on a closed database.")
    return self.connection.in_transaction
def _execute(self, cmd, *args): """执行sql命令,返回结果""" try: cursor = self._conn.cursor() return cursor.execute(cmd, *args) except AttributeError: raise sqlite3.ProgrammingError('Cannot operate on ' 'a closed database.')
def _cursor(self): try: return self._conn.cursor() except sqlite3.ProgrammingError as e: if not self._cursor_fail: self._cursor_fail = True self._conn = sqlite3.connect(self.db_path) return self._conn.cursor() else: raise sqlite3.ProgrammingError(e)
def cursor(self):
    """Return a cursor, mirroring :any:`sqlite3.Connection.cursor`.

    Outside single-cursor mode a new ``Cursor`` wrapping this object is
    returned. In single-cursor mode the stored ``self._cursor`` is returned.

    :raises sqlite3.ProgrammingError: in single-cursor mode when
        ``self._cursor`` is ``None``
    """
    if not self.single_cursor_mode:
        return Cursor(self)
    if self._cursor is not None:
        return self._cursor
    raise sqlite3.ProgrammingError("Cannot operate on a closed database.")
def patched_execute(self, sql, *args, **kwargs):
    """Replacement for ``execute`` that fails on a chosen statement.

    This is a closure: ``count``, ``sql_seen``, ``fail_on_sql``,
    ``fail_on_count`` and the wrapped ``execute`` all come from the
    enclosing scope.

    Every statement equal to ``fail_on_sql`` sets ``sql_seen``. If ``count``
    equals ``fail_on_count`` at that point, ``sqlite3.ProgrammingError`` is
    raised; otherwise ``count`` is incremented. All other calls, and
    matching calls that did not raise, are forwarded to ``execute``.
    """
    nonlocal count, sql_seen
    if sql == fail_on_sql:
        # Record that the target statement was reached at least once.
        sql_seen = True
        if count == fail_on_count:
            raise sqlite3.ProgrammingError("Uh oh")
        # Count this occurrence so a later one can be the failing one.
        count += 1
    return execute(self, sql, *args, **kwargs)
def executemany(self, cmd: str, arg_iter):
    """Execute an SQL command once per parameter set and return the result.

    :param cmd: SQL statement
    :param arg_iter: iterable of parameter sets, each one holding all the
        parameters needed for a single execution of ``cmd``
    :return: whatever ``cursor.executemany`` returns
    :raises sqlite3.ProgrammingError: if obtaining the cursor fails with an
        ``AttributeError`` (e.g. ``self._conn`` is ``None``)
    """
    # Only the cursor acquisition is guarded: an AttributeError raised while
    # iterating ``arg_iter`` or running the statement must not be reported
    # as "closed database".
    try:
        cursor = self._conn.cursor()
    except AttributeError:
        raise sqlite3.ProgrammingError('Cannot operate on '
                                       'a closed database.') from None
    return cursor.executemany(cmd, arg_iter)
def from_database_id(cls, arch_id, db_connection):
    """
    Rebuild a DenseArchipelago from its rows in the database.

    :param arch_id: ArchID of the archipelago to load
    :param db_connection: object used both to execute the queries and to
        fetch their results (it is driven like a SQLite3 cursor)
    :return: a DenseArchipelago whose four bounds and land list come from
        the ``archs`` and ``land`` tables
    :raises sqlite3.ProgrammingError: if no archipelago has this ArchID, or
        if no land rows reference it
    """
    archipelago = DenseArchipelago()

    # The first bound doubles as the existence check for the ArchID.
    db_connection.execute(
        "SELECT LeftBd FROM archs WHERE ArchID={}".format(arch_id))
    left_row = db_connection.fetchone()
    if left_row is None:
        raise sqlite3.ProgrammingError(
            "The archipelago with ArchId={} isn't in the database".format(
                arch_id))
    archipelago.left_bd = left_row[0]

    # Remaining bounds of the bounding box, one query each, in this order.
    other_bounds = (
        ("right_bd", "SELECT RightBd FROM archs WHERE ArchID={}"),
        ("upper_bd", "SELECT UpBd FROM archs WHERE ArchID={}"),
        ("lower_bd", "SELECT LowBd FROM archs WHERE ArchID={}"),
    )
    for attribute, query in other_bounds:
        db_connection.execute(query.format(arch_id))
        setattr(archipelago, attribute, db_connection.fetchone()[0])

    # Every (X, Y) row belonging to the archipelago becomes a land entry.
    db_connection.execute(
        "SELECT X, Y FROM land WHERE ParentArchipelago={}".format(arch_id))
    land_rows = db_connection.fetchall()
    if not land_rows:
        raise sqlite3.ProgrammingError(
            "Unable to find land associated with ArchId={} in database".
            format(arch_id))
    for coordinates in land_rows:
        archipelago.land.append(coordinates)
    return archipelago
def insert_records(cursor, table, columns, records):
    """Insert *records* into *table* with one ``executemany`` call.

    *table* and *columns* are first passed through ``normalize_names``. The
    statement uses one ``?`` placeholder per column.

    :raises sqlite3.ProgrammingError: the driver's own error, or, when its
        message mentions an incorrect number of bindings, a replacement
        error that explains the rows must be normalized to the column count
    """
    table = normalize_names(table)
    columns = normalize_names(columns)
    column_list = ', '.join(columns)
    placeholders = ', '.join(['?'] * len(columns))
    sql = 'INSERT INTO {0} ({1}) VALUES ({2})'.format(
        table, column_list, placeholders)

    try:
        cursor.executemany(sql, records)
    except sqlite3.ProgrammingError as error:
        if 'incorrect number of bindings' not in str(error).lower():
            raise error
        template = (
            '{0}\n\nThe records {1!r} contains some rows with too '
            'few or too many values. Before loading this data, it '
            'must be normalized so each row contains a number of '
            'values equal to the number of columns being loaded.')
        friendly = sqlite3.ProgrammingError(template.format(error, records))
        # Clearing __cause__ hides the driver error from the traceback.
        friendly.__cause__ = None
        raise friendly
def getValue(cursor, tableName, desiredColumn, searchColumn, searchValue,
             getMultiple=False, sortedResults=False, sortBy="",
             descending=False):
    """Fetch ``desiredColumn`` from the rows where ``searchColumn`` matches.

    ``searchValue`` is bound as a query parameter. The table and column
    names are interpolated into the SQL text, so they must come from
    trusted code.

    :param cursor: cursor providing ``execute`` and ``fetchall``
    :param tableName: table to query
    :param desiredColumn: column whose value(s) are returned
    :param searchColumn: column compared against ``searchValue``
    :param searchValue: value to look for
    :param getMultiple: return the list of all matches instead of the first
    :param sortedResults: append an ``ORDER BY sortBy`` clause
    :param sortBy: column used for ordering when ``sortedResults`` is set
    :param descending: order descending instead of ascending (only used
        together with ``sortedResults``)
    :return: the first matching value, or the list of all matching values
        when ``getMultiple`` is set
    :raises sqlite3.ProgrammingError: if no row matches
    """
    query = f"SELECT {desiredColumn} FROM {tableName} WHERE {searchColumn}=?"
    if sortedResults:
        # Leading space keeps the clause separated from the "?" placeholder.
        query += f" ORDER BY {sortBy}"
        if descending:
            query += " DESC"

    cursor.execute(query, (searchValue,))
    # fetchall() yields one tuple per row; keep only the selected column.
    values = [row[0] for row in cursor.fetchall()]

    if not values:
        raise sqlite3.ProgrammingError(
            f"{desiredColumn} for {searchColumn} {searchValue} was not found")
    return values if getMultiple else values[0]
def perform_check(keys, shelve_stacks, args, logger):
    """
    Build and check every transcript, then write the GTF and FASTA output.

    With ``prepare.single`` set or a single thread, each transcript is loaded
    from its shelf database, checked through ``create_transcript`` against
    the genome sequence and printed directly. Otherwise the keys are
    shuffled, split into one batch per thread, each batch is handed to a
    ``CheckingProcess`` through a temporary msgpack file, and the per-process
    partial outputs are merged afterwards. Both output handles are closed
    before returning.

    :param keys: iterable of ``((tid, shelf_name), chrom, (start, end))``
        items, as unpacked in the loop below
    :param shelve_stacks: dictionary mapping a shelf name to a dictionary
        holding at least a ``"cursor"`` for that shelf database
    :param args: the namespace; ``json_conf``, ``tempdir``, ``logging_queue``
        and ``level`` are read from it
    :param logger: logger
    :return: None
    :raises sqlite3.ProgrammingError: if the shelf query fails (the message
        is extended with the transcript ID)
    :raises KeyError: if a chromosome is not among the genome references
    """
    counter = 0

    # FASTA extraction *has* to be done at the main process level, it's too slow
    # to create an index in each process.

    if args.json_conf["prepare"]["single"] is True or args.json_conf["threads"] == 1:

        # Use functools to pre-configure the function
        # with all necessary arguments aside for the lines
        partial_checker = functools.partial(
            create_transcript,
            canonical_splices=args.json_conf["prepare"]["canonical"],
            logger=logger,
            force_keep_cds=not args.json_conf["prepare"]["strip_cds"])

        for tid, chrom, key in keys:
            # The first element carries both the transcript ID and the name
            # of the shelf it is stored in.
            tid, shelf_name = tid
            try:
                tobj = json.loads(next(shelve_stacks[shelf_name]["cursor"].execute(
                    "SELECT features FROM dump WHERE tid = ?", (tid,)))[0])
            except sqlite3.ProgrammingError as exc:
                raise sqlite3.ProgrammingError("{}. Tids: {}".format(exc, tid))
            if chrom not in args.json_conf["reference"]["genome"].references:
                raise KeyError("Invalid chromosome name! {}, {}, {}, {}".format(tid, shelf_name, chrom, key))

            # key holds the start/end positions; the start is shifted by one
            # for the fetch call.
            transcript_object = partial_checker(
                tobj,
                str(args.json_conf["reference"]["genome"].fetch(chrom, key[0]-1, key[1])),
                key[0], key[1],
                lenient=args.json_conf["prepare"]["lenient"],
                is_reference=tobj["is_reference"],
                strand_specific=tobj["strand_specific"])
            if transcript_object is None:
                continue
            counter += 1
            # Progress: info every 10,000 transcripts, debug every 1,000.
            if counter >= 10**4 and counter % (10**4) == 0:
                logger.info("Retrieved %d transcript positions", counter)
            elif counter >= 10**3 and counter % (10**3) == 0:
                logger.debug("Retrieved %d transcript positions", counter)
            print(transcript_object.format("gtf"),
                  file=args.json_conf["prepare"]["files"]["out"])
            print(transcript_object.fasta,
                  file=args.json_conf["prepare"]["files"]["out_fasta"])
    else:
        # pylint: disable=no-member

        # submission_queue = multiprocessing.JoinableQueue(-1)

        # Number the keys from 1 and shuffle them before batching.
        batches = list(enumerate(keys, 1))
        # np.random.shuffle(batches)
        random.shuffle(batches)
        kwargs = {
            "fasta_out": os.path.basename(args.json_conf["prepare"]["files"]["out_fasta"].name),
            "gtf_out": os.path.basename(args.json_conf["prepare"]["files"]["out"].name),
            "tmpdir": args.tempdir.name,
            "seed": args.json_conf["seed"],
            "lenient": args.json_conf["prepare"]["lenient"],
            "canonical_splices": args.json_conf["prepare"]["canonical"],
            "force_keep_cds": not args.json_conf["prepare"]["strip_cds"],
            "log_level": args.level
        }

        working_processes = []
        for idx, batch in enumerate(np.array_split(batches, args.json_conf["threads"]), 1):
            # The batch is written to a named temporary file that is kept on
            # disk (delete=False); only its name is passed to the process.
            # NOTE(review): nothing here removes the file afterwards --
            # presumably CheckingProcess does; confirm.
            batch_file = tempfile.NamedTemporaryFile(delete=False, mode="wb")
            msgpack.dump(batch.tolist(), batch_file)
            batch_file.flush()
            batch_file.close()

            proc = CheckingProcess(
                batch_file.name,
                args.logging_queue,
                args.json_conf["reference"]["genome"].filename,
                idx,
                shelve_stacks.keys(),
                **kwargs)
            proc.start()
            working_processes.append(proc)

        # Wait for every worker before merging.
        [_.join() for _ in working_processes]

        # Partial files are looked up as "<output basename>-<n>" inside the
        # temporary directory, n running from 1 to the number of threads.
        partial_gtf = [os.path.join(args.tempdir.name,
                                    "{0}-{1}".format(
                                        os.path.basename(args.json_conf["prepare"]["files"]["out"].name),
                                        _ + 1)) for _ in range(args.json_conf["threads"])]
        merge_partial(partial_gtf, args.json_conf["prepare"]["files"]["out"])

        partial_fasta = [os.path.join(
            args.tempdir.name,
            "{0}-{1}".format(os.path.basename(args.json_conf["prepare"]["files"]["out_fasta"].name), _ + 1))
                         for _ in range(args.json_conf["threads"])]
        merge_partial(partial_fasta, args.json_conf["prepare"]["files"]["out_fasta"])

    args.json_conf["prepare"]["files"]["out_fasta"].close()
    args.json_conf["prepare"]["files"]["out"].close()

    # NOTE(review): the log call between these two level changes is commented
    # out, so the level is raised and immediately restored.
    logger.setLevel(logging.INFO)
    # logger.info("Finished to analyse %d transcripts (%d retained)",
    #             len(exon_lines), counter)
    logger.setLevel(args.level)
    return
def execute(*args):
    """Always fail with ``sqlite3.ProgrammingError('unexpected error')``.

    Any positional arguments are accepted and ignored.
    """
    failure = sqlite3.ProgrammingError('unexpected error')
    raise failure
def exception_handling(e):
    """Re-raise *e* as a ProgrammingError that also shows the request.

    ``req`` and ``params`` are read from the enclosing scope and appended to
    the text of *e*.

    :raises sqlite3.ProgrammingError: always
    """
    original_text = str(e)
    request_details = '\nreq = “{}”\nparams = “{}”'.format(req, params)
    raise sqlite3.ProgrammingError(original_text + request_details)
def perform_check(keys, shelve_stacks, args, logger):
    """
    Build and check every transcript, then write the GTF and FASTA output.

    With ``prepare.single`` set or a single process, each transcript is
    loaded from its shelf database, checked through ``create_transcript``
    against the genome sequence and printed directly. Otherwise a pool of
    ``CheckingProcess`` workers is started, every transcript is pushed onto a
    shared queue, and the per-process partial outputs are merged once the
    workers have been joined. Both output handles are closed before
    returning.

    :param keys: iterable of ``((tid, shelf_name), chrom, (start, end))``
        items, as unpacked in the loops below
    :param shelve_stacks: dictionary mapping a shelf name to a dictionary
        holding at least a ``"cursor"`` for that shelf database
    :param args: the namespace; ``json_conf``, ``tempdir``, ``logging_queue``
        and ``level`` are read from it
    :param logger: logger
    :return: None
    :raises sqlite3.ProgrammingError: in single-process mode, if the shelf
        query fails (the message is extended with the transcript ID)
    """
    counter = 0

    # FASTA extraction *has* to be done at the main process level, it's too slow
    # to create an index in each process.

    if args.json_conf["prepare"]["single"] is True or args.json_conf[
            "prepare"]["procs"] == 1:

        # Use functools to pre-configure the function
        # with all necessary arguments aside for the lines
        partial_checker = functools.partial(
            create_transcript,
            canonical_splices=args.json_conf["prepare"]["canonical"],
            logger=logger,
            force_keep_cds=not args.json_conf["prepare"]["strip_cds"])

        for tid, chrom, key in keys:
            # The first element carries both the transcript ID and the name
            # of the shelf it is stored in.
            tid, shelf_name = tid
            try:
                tobj = json.loads(
                    next(shelve_stacks[shelf_name]["cursor"].execute(
                        "SELECT features FROM dump WHERE tid = ?",
                        (tid, )))[0])
            except sqlite3.ProgrammingError as exc:
                raise sqlite3.ProgrammingError("{}. Tids: {}".format(exc, tid))

            # key holds the start/end positions; the genome is sliced from
            # start - 1 up to end.
            transcript_object = partial_checker(
                tobj,
                str(args.json_conf["reference"]["genome"][chrom][key[0] -
                                                                  1:key[1]]),
                key[0],
                key[1],
                lenient=args.json_conf["prepare"]["lenient"],
                is_reference=tobj["is_reference"],
                strand_specific=tobj["strand_specific"])
            if transcript_object is None:
                continue
            counter += 1
            # Progress: info every 10,000 transcripts, debug every 1,000.
            if counter >= 10**4 and counter % (10**4) == 0:
                logger.info("Retrieved %d transcript positions", counter)
            elif counter >= 10**3 and counter % (10**3) == 0:
                logger.debug("Retrieved %d transcript positions", counter)
            print(transcript_object.format("gtf"),
                  file=args.json_conf["prepare"]["files"]["out"])
            print(transcript_object.fasta,
                  file=args.json_conf["prepare"]["files"]["out_fasta"])
    else:
        # pylint: disable=no-member
        submission_queue = multiprocessing.Queue(-1)

        # One worker per configured process, numbered from 1.
        working_processes = [
            CheckingProcess(
                submission_queue,
                args.logging_queue,
                args.json_conf["reference"]["genome"].filename,
                _ + 1,
                os.path.basename(
                    args.json_conf["prepare"]["files"]["out_fasta"].name),
                os.path.basename(
                    args.json_conf["prepare"]["files"]["out"].name),
                args.tempdir.name,
                lenient=args.json_conf["prepare"]["lenient"],
                canonical_splices=args.json_conf["prepare"]["canonical"],
                log_level=args.level)
            for _ in range(args.json_conf["prepare"]["procs"])
        ]

        [_.start() for _ in working_processes]

        # NOTE(review): the loop variable rebinds the name `keys`; iteration
        # still runs over the original object because enumerate() was handed
        # it before the first rebinding.
        for counter, keys in enumerate(keys):
            tid, chrom, (pos) = keys
            tid, shelf_name = tid
            tobj = json.loads(
                next(shelve_stacks[shelf_name]["cursor"].execute(
                    "SELECT features FROM dump WHERE tid = ?", (tid, )))[0])
            submission_queue.put((tobj, pos[0], pos[1], counter + 1))

        # A single 4-item "EXIT" tuple is queued after the last transcript.
        # NOTE(review): presumably each worker re-queues it for its siblings;
        # confirm in CheckingProcess.
        submission_queue.put(tuple(["EXIT"] * 4))

        [_.join() for _ in working_processes]

        # Partial files are looked up as "<output basename>-<n>" inside the
        # temporary directory, n running from 1 to the number of processes.
        partial_gtf = [
            os.path.join(
                args.tempdir.name, "{0}-{1}".format(
                    os.path.basename(
                        args.json_conf["prepare"]["files"]["out"].name),
                    _ + 1)) for _ in range(args.json_conf["prepare"]["procs"])
        ]
        merge_partial(partial_gtf, args.json_conf["prepare"]["files"]["out"])

        partial_fasta = [
            os.path.join(
                args.tempdir.name, "{0}-{1}".format(
                    os.path.basename(
                        args.json_conf["prepare"]["files"]["out_fasta"].name),
                    _ + 1)) for _ in range(args.json_conf["prepare"]["procs"])
        ]
        merge_partial(partial_fasta,
                      args.json_conf["prepare"]["files"]["out_fasta"])

    args.json_conf["prepare"]["files"]["out_fasta"].close()
    args.json_conf["prepare"]["files"]["out"].close()

    # NOTE(review): the log call between these two level changes is commented
    # out, so the level is raised and immediately restored.
    logger.setLevel(logging.INFO)
    # logger.info("Finished to analyse %d transcripts (%d retained)",
    #             len(exon_lines), counter)
    logger.setLevel(args.level)
    return
def run(self):
    """Start polling the queue, analyse the loci, and send them to the
    printer process.

    Items taken from ``self.locus_queue`` are tuples whose first element is
    either the string ``"EXIT"`` or an integer counter. For a counter, the
    JSON list of transcripts stored under it in ``temp_store.db`` is loaded,
    grouped into a ``Superlocus``, analysed and printed through
    ``print_locus``. On ``"EXIT"`` the marker is put back on the queue, the
    handles are closed and the loop ends.

    :raises KeyError: if the database has no row for a counter
    """
    self.logger.debug("Starting to parse data for {0}".format(self.name))

    current_chrom = None

    # Read-only connection
    conn = sqlite3.connect("file:{}?mode=ro".format(
        os.path.join(self._tempdir, "temp_store.db")),
        uri=True,
        isolation_level="DEFERRED",
        timeout=60,
        check_same_thread=False)
    cursor = conn.cursor()

    while True:
        counter = self.locus_queue.get()[0]
        if counter == "EXIT":
            self.logger.debug("EXIT received for %s", self.name)
            self.locus_queue.task_done()
            # Put the marker back on the queue before leaving.
            # NOTE(review): presumably so the other consumers also see it.
            self.locus_queue.put((counter, ))
            self.__close_handles()
            break
            # self.join()
        else:
            assert isinstance(counter, int), type(counter)
            try:
                transcripts = cursor.execute(
                    "SELECT json FROM transcripts WHERE counter=?",
                    (str(counter), )).fetchone()
            except sqlite3.ProgrammingError as exc:
                # Log the failure together with the query parameter and stop.
                self.logger.exception(
                    sqlite3.ProgrammingError(
                        (exc, counter, str(counter), (str(counter), ))))
                self.__close_handles()
                break

            if transcripts is None:
                # Format the message here: exceptions do not apply
                # logging-style "%s" arguments on their own.
                raise KeyError(
                    "Nothing found in the database for %s" % counter)

            transcripts = json.loads(transcripts[0])
            if len(transcripts) == 0:
                stranded_loci = []
            else:
                tobjects = []
                for tjson in transcripts:
                    transcript = Transcript(logger=self.logger)
                    transcript.load_dict(tjson)
                    tobjects.append(transcript)

                # Seed the superlocus with one transcript, then add the rest.
                slocus = Superlocus(tobjects.pop(),
                                    stranded=False,
                                    json_conf=self.json_conf,
                                    source=self.json_conf["pick"]
                                    ["output_format"]["source"])
                while len(tobjects) > 0:
                    slocus.add_transcript_to_locus(tobjects.pop(),
                                                   check_in_locus=False)

                # The gene counter restarts whenever the chromosome changes.
                if current_chrom != slocus.chrom:
                    self.__gene_counter = 0
                    current_chrom = slocus.chrom
                if self.regressor is not None:
                    slocus.regressor = self.regressor
                stranded_loci = self.analyse_locus(slocus, counter)

            for stranded_locus in stranded_loci:
                self.__gene_counter = print_locus(stranded_locus,
                                                  self.__gene_counter,
                                                  self._handles,
                                                  counter=counter,
                                                  logger=self.logger,
                                                  json_conf=self.json_conf)
            self.locus_queue.task_done()

    return
def database_retrieval():
    '''
    Retrieve the enabled test cases from the module's SQLite database.

    The database file is named after ``name_of_module_to_test`` with a
    ``.sqlite3`` suffix. If the 'Test' table does not exist yet, it is
    created and an error explaining how to populate it is raised.

    Output:
        A list of tuples (rows) with the values of 'name_of_method_to_test',
        'data_input', 'expected_output' and 'comment', restricted to the rows
        whose 'perform_test' column equals 1.

    Raises:
        sqlite3.ProgrammingError: if the table had to be created, i.e. there
        is no test data yet.
    '''
    # Names of the file, the table and its columns; they are interpolated
    # into the SQL statements and the help message below.
    table_attributes = {
        'database_filename': name_of_module_to_test + '.sqlite3',
        'table_name': 'Test',
        'method_column': 'name_of_method_to_test',
        'input_column': 'data_input',
        'output_column': 'expected_output',
        'is_test_performed_column': 'perform_test',
        'comment_column': 'comment'
    }
    connection = sqlite3.connect(table_attributes['database_filename'])
    try:
        cursor = connection.cursor()
        # Check if the table exists
        cursor.execute(''' SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name = ? ''', (table_attributes['table_name'], ))
        if int(cursor.fetchone()[0]) != 1:
            # No table yet: create it, then tell the user to fill it in.
            cursor.execute(''' CREATE TABLE {table_name} ({method_column} TEXT, {input_column} TEXT, {output_column} TEXT, {is_test_performed_column} INTEGER DEFAULT 1 /* Pity there's no BOOLEAN in SQLite */, {comment_column}) '''.format(**table_attributes))
            raise sqlite3.ProgrammingError(''' The database didn't exist. Creating a new database... Database file '{database_filename}' has been created. Fill the table '{table_name}' with data and execute this program again to run the tests: In the '{method_column}' column you should type the name of the method to test. This method must accept one single parameter of type string. E.G: frequent_words In '{input_column}' the string that will be passed to the method when called. It can have multiple lines, E.G: ACGTTGCATGTCGCATGATGCATGAGAGCT 4 In '{output_column}' the string that the method should return if it is correct. E.G: CATG GCAT '''.format(**table_attributes))
        else:
            # Only rows flagged for execution are returned.
            cursor.execute(''' SELECT {method_column}, {input_column}, {output_column}, {comment_column} FROM {table_name} WHERE {is_test_performed_column} = 1 '''.format(**table_attributes))
            result = cursor.fetchall()
    finally:
        # The connection is closed on both the success and the error path.
        connection.close()
    return result
def run(self):
    """Start polling the queue, analyse the loci, and send them to the
    printer process.

    Items taken from ``self.locus_queue`` are tuples whose first element is
    either the string ``"EXIT"`` or a counter. For a counter, the msgpack
    list of transcripts stored under it in ``temp_store.db`` is loaded, each
    transcript is rebuilt from its GTF definition and exon lines, the
    transcripts are grouped into a ``Superlocus`` and analysed, the result is
    serialised through ``serialise_locus`` and the counter is reported on
    ``self.status_queue``. On ``"EXIT"`` the marker is put back on the queue
    and the loop ends.

    :raises KeyError: if the database has no row for a counter, or if a
        transcript definition has no "transcript" entry
    """
    self.logger.debug("Starting to parse data for {0}".format(self.name))

    # Read-only connection
    conn = sqlite3.connect("file:{}?mode=ro".format(os.path.join(self._tempdir, "temp_store.db")),
                           uri=True,  # Necessary to use the Read-only mode from file string
                           isolation_level="DEFERRED",
                           timeout=60,
                           check_same_thread=False  # Necessary for SQLite3 to function in multiprocessing
                           )
    cursor = conn.cursor()

    # Output switches, read once from the configuration.
    print_cds = (not self.json_conf["pick"]["run_options"]["exclude_cds"])
    print_monoloci = (self.json_conf["pick"]["files"]["monoloci_out"] != "")
    print_subloci = (self.json_conf["pick"]["files"]["subloci_out"] != "")

    while True:
        counter = self.locus_queue.get()[0]
        if counter == "EXIT":
            self.logger.debug("EXIT received for %s", self.name)
            self.locus_queue.task_done()
            # Put the marker back on the queue before leaving.
            # NOTE(review): presumably so the other consumers also see it.
            self.locus_queue.put((counter, ))
            break
        else:
            try:
                transcripts = cursor.execute(
                    "SELECT json FROM transcripts WHERE counter=?", (str(counter),)).fetchone()
            except sqlite3.ProgrammingError as exc:
                # Log the failure together with the query parameter and stop.
                self.logger.exception(sqlite3.ProgrammingError((exc, counter, str(counter),
                                                                (str(counter),))))
                # self.__close_handles()
                break

            if transcripts is None:
                # Format the message here: exceptions do not apply
                # logging-style "%s" arguments on their own.
                raise KeyError("Nothing found in the database for %s" % counter)

            transcripts = msgpack.loads(transcripts[0], raw=False)
            if len(transcripts) == 0:
                stranded_loci = []
                self.logger.warning("No transcript found for index %d", counter)
            else:
                tobjects = []
                chroms = set()
                for tjson in transcripts:
                    definition = GtfLine(tjson["definition"]).as_dict()
                    is_reference = definition["source"] in self.json_conf["prepare"]["files"]["reference"]
                    transcript = Transcript(logger=self.logger,
                                            source=definition["source"],
                                            intron_range=self.json_conf["pick"]["run_options"]["intron_range"],
                                            is_reference=is_reference)
                    transcript.chrom, transcript.start, transcript.end = (definition["chrom"],
                                                                          definition["start"],
                                                                          definition["end"])
                    # All transcripts of one counter must share a chromosome.
                    chroms.add(transcript.chrom)
                    assert len(chroms) == 1, chroms
                    try:
                        transcript.id = definition["transcript"]
                    except KeyError:
                        raise KeyError(definition)
                    transcript.strand, transcript.feature = definition["strand"], definition["feature"]
                    transcript.attributes = definition["attributes"]
                    try:
                        for exon in tjson["exon_lines"]:
                            start, end, feature, phase = exon
                            transcript.add_exon((start, end), feature=feature, phase=phase)
                        transcript.finalize()
                        tobjects.append(transcript)
                    except InvalidTranscript as exc:
                        # Invalid transcripts are logged and left out.
                        self.logger.exception("Transcript %s is invalid. Ignoring. Error: %s",
                                              transcript.id, exc)

                # Seed the superlocus with one transcript, then add the rest.
                # NOTE(review): if every transcript was invalid, tobjects is
                # empty here and pop() raises IndexError -- confirm whether
                # that can happen.
                slocus = Superlocus(tobjects.pop(),
                                    stranded=False,
                                    json_conf=self.json_conf,
                                    source=self.json_conf["pick"]["output_format"]["source"])
                while len(tobjects) > 0:
                    slocus.add_transcript_to_locus(tobjects.pop(),
                                                   check_in_locus=False)

                if self.regressor is not None:
                    slocus.regressor = self.regressor
                stranded_loci = self.analyse_locus(slocus, counter)

            serialise_locus(stranded_loci, self.dump_conn, counter,
                            print_cds=print_cds,
                            print_monosubloci=print_monoloci,
                            print_subloci=print_subloci)
            if len(stranded_loci) == 0:
                self.logger.warning("No loci left for index %d", counter)
            self.status_queue.put(counter)
            self.locus_queue.task_done()

    return
def perform_check(keys, shelve_names, mikado_config: MikadoConfiguration, logger):
    """
    Build and check every transcript, then write the GTF and FASTA output.

    With ``prepare.single`` set or a single thread, each transcript is read
    from its shelf file (zlib-compressed msgpack at a known offset), checked
    through ``create_transcript`` against the genome sequence and printed
    directly; the shelf files are closed afterwards, also on error. Otherwise
    the keys are shuffled, split into one batch per thread, each batch is
    handed to a ``CheckingProcess`` through a temporary msgpack file, and the
    per-process partial outputs are merged afterwards. Both output handles
    are closed before returning.

    :param keys: iterable of
        ``((tid, shelf_name, write_start, write_length), chrom, (start, end))``
        items, as unpacked in the loop below
    :param shelve_names: list of the temporary files.
    :param mikado_config: MikadoConfiguration
    :param logger: logger
    :return: None
    :raises KeyError: if a chromosome is not among the genome references
    :raises ValueError: if the genome fetch fails for a transcript
    """
    counter = 0

    # FASTA extraction *has* to be done at the main process level, it's too slow
    # to create an index in each process.

    if mikado_config.prepare.single is True or mikado_config.threads == 1:
        shelve_stacks = dict()
        try:
            # Opened inside the try block so that every handle opened so far
            # is closed again even if a later open() or a check fails.
            for shelf in shelve_names:
                shelve_stacks[shelf] = open(shelf, "rb")

            # Use functools to pre-configure the function
            # with all necessary arguments aside for the lines
            partial_checker = functools.partial(
                create_transcript,
                canonical_splices=mikado_config.prepare.canonical,
                codon_table=mikado_config.serialise.codon_table,
                logger=logger,
                strip_faulty_cds=mikado_config.prepare.strip_faulty_cds)

            for tid, chrom, key in keys:
                # The first element also says where the record lives: which
                # shelf file, at which offset, and how many bytes.
                tid, shelf_name, write_start, write_length = tid
                try:
                    shelf = shelve_stacks[shelf_name]
                    shelf.seek(write_start)
                    tobj = msgpack.loads(zlib.decompress(
                        (shelf.read(write_length))), raw=False)
                except sqlite3.ProgrammingError as exc:
                    # NOTE(review): nothing in this try body appears to use
                    # sqlite3, so this handler looks unreachable -- confirm.
                    raise sqlite3.ProgrammingError(
                        "{}. Tids: {}".format(exc, tid))
                if chrom not in mikado_config.reference.genome.references:
                    raise KeyError(
                        "Invalid chromosome name! {}, {}, {}, {}".format(
                            tid, shelf_name, chrom, key))

                try:
                    seq = str(
                        mikado_config.reference.genome.fetch(
                            chrom, key[0] - 1, key[1]))
                except ValueError:
                    raise ValueError(tobj)

                transcript_object = partial_checker(
                    tobj,
                    seq,
                    key[0], key[1],
                    lenient=mikado_config.prepare.lenient,
                    is_reference=tobj["is_reference"],
                    strand_specific=tobj["strand_specific"])
                if transcript_object is None:
                    continue
                counter += 1
                # Progress: info every 10,000 transcripts, debug every 1,000.
                if counter >= 10**4 and counter % (10**4) == 0:
                    logger.info("Retrieved %d transcript positions", counter)
                elif counter >= 10**3 and counter % (10**3) == 0:
                    logger.debug("Retrieved %d transcript positions", counter)
                print(transcript_object.format("gtf"),
                      file=mikado_config.prepare.files.out)
                print(transcript_object.fasta,
                      file=mikado_config.prepare.files.out_fasta)
        finally:
            for handle in shelve_stacks.values():
                handle.close()
    else:
        # pylint: disable=no-member

        # submission_queue = multiprocessing.JoinableQueue(-1)

        # Number the keys from 1 and shuffle them before batching.
        batches = list(enumerate(keys, 1))
        # np.random.shuffle(batches)
        random.shuffle(batches)
        kwargs = {
            "fasta_out": os.path.basename(mikado_config.prepare.files.out_fasta.name),
            "gtf_out": os.path.basename(mikado_config.prepare.files.out.name),
            "tmpdir": mikado_config.tempdir.name,
            "seed": mikado_config.seed,
            "lenient": mikado_config.prepare.lenient,
            "canonical_splices": mikado_config.prepare.canonical,
            "strip_faulty_cds": mikado_config.prepare.strip_faulty_cds,
            "codon_table": mikado_config.serialise.codon_table,
            "log_level": mikado_config.log_settings.log_level
        }

        working_processes = []
        batch_files = []
        for idx, batch in enumerate(
                np.array_split(np.array(batches, dtype=object),
                               mikado_config.threads), 1):
            # The handle is kept open (and collected in batch_files) because
            # the file is created with delete=True and would vanish on close.
            batch_file = tempfile.NamedTemporaryFile(delete=True, mode="wb")
            msgpack.dump(batch.tolist(), batch_file)
            batch_file.flush()
            batch_files.append(batch_file)

            proc = CheckingProcess(
                batch_file.name,
                mikado_config.logging_queue,
                mikado_config.reference.genome.filename,
                idx,
                shelve_names,
                **kwargs)
            try:
                proc.start()
            except TypeError as exc:
                # Log what was passed to the process before re-raising.
                logger.critical("Failed arguments: %s", (batch_file.name,
                                                         mikado_config.logging_queue,
                                                         mikado_config.reference.genome.filename,
                                                         idx,
                                                         shelve_names))
                logger.critical("Failed kwargs: %s", kwargs)
                logger.critical(exc)
                raise
            working_processes.append(proc)

        # Wait for every worker before merging.
        for proc in working_processes:
            proc.join()

        # Partial files are looked up as "<output basename>-<n>" inside the
        # temporary directory, n running from 1 to the number of threads.
        partial_gtf = [
            os.path.join(
                mikado_config.tempdir.name,
                "{0}-{1}".format(
                    os.path.basename(mikado_config.prepare.files.out.name),
                    _ + 1))
            for _ in range(mikado_config.threads)
        ]
        merge_partial(partial_gtf, mikado_config.prepare.files.out)

        partial_fasta = [
            os.path.join(
                mikado_config.tempdir.name,
                "{0}-{1}".format(
                    os.path.basename(
                        mikado_config.prepare.files.out_fasta.name),
                    _ + 1))
            for _ in range(mikado_config.threads)
        ]
        merge_partial(partial_fasta, mikado_config.prepare.files.out_fasta)

        # Closing the batch files also removes them (delete=True).
        for batch_file in batch_files:
            batch_file.close()

    mikado_config.prepare.files.out_fasta.close()
    mikado_config.prepare.files.out.close()

    # NOTE(review): the log call between these two level changes is commented
    # out, so the level is raised and immediately restored.
    logger.setLevel(logging.INFO)
    # logger.info("Finished to analyse %d transcripts (%d retained)",
    #             len(exon_lines), counter)
    logger.setLevel(mikado_config.log_settings.log_level)
    return