def check_muta_conflict(ctx, chunksize):
    """Cross-check mutation positions against UniProt sequences.

    Streams distinct (isoform, Pos) pairs joined from IDMapping/Mutation in
    chunks of ``chunksize``, looks up the residue at each position in the
    matching sequence, and inserts the results into the ``UniProtSeq`` table.

    Parameters
    ----------
    ctx : click context; ``ctx.obj['custom_db']`` is the target database.
    chunksize : int, rows fetched/inserted per round trip.
    """
    def get_seq(seq_dict, iso, pos):
        # 'X' marks an unresolvable residue: a position beyond the sequence
        # (IndexError) or -- fixed here -- an isoform with no sequence row at
        # all (KeyError), which previously crashed the whole chunked run.
        try:
            return seq_dict[iso][pos-1]
        except (KeyError, IndexError):
            return 'X'

    custom_db = ctx.obj['custom_db']
    root_query = """SELECT DISTINCT isoform, Pos FROM IDMapping, Mutation WHERE IDMapping.ftId = Mutation.ftId AND isoform != 'NaN'"""
    # Isoform accessions (those containing '-') live in ALTERNATIVE_PRODUCTS;
    # canonical accessions live in INFO.
    fetch_iso_seq_query = "SELECT isoform, sequence FROM ALTERNATIVE_PRODUCTS WHERE isoform IN ({}) AND (sequenceStatus = 'displayed' OR sequenceStatus = 'described');"
    fetch_can_seq_query = "SELECT accession, sequence FROM INFO WHERE accession IN ({}) ;"
    total = unsync_run(custom_db.database.fetch_val(query=f"SELECT COUNT(*) FROM ({root_query});"))
    console.log(f"Total {total} to query")
    with console.status("[bold green]checking..."):
        for i in range(ceil(total/chunksize)):
            unp_pos = DataFrame(
                unsync_run(custom_db.database.fetch_all(
                    query=f"{root_query} LIMIT {chunksize} OFFSET {chunksize*i};")),
                columns=['isoform', 'Pos'])
            mask = unp_pos.isoform.str.contains('-')
            # Merge isoform and canonical sequences into one lookup dict.
            seq_dict = dict(
                unsync_run(Identifier.sqlite_api.database.fetch_all(
                    query=fetch_iso_seq_query.format(','.join(f"'{ix}'" for ix in set(unp_pos[mask].isoform)))))
                + unsync_run(Identifier.sqlite_api.database.fetch_all(
                    query=fetch_can_seq_query.format(','.join(f"'{ix}'" for ix in set(unp_pos[~mask].isoform))))))
            unp_pos['Ref'] = [get_seq(seq_dict, iso, pos) for iso, pos in zip(unp_pos.isoform, unp_pos.Pos)]
            custom_db.sync_insert(custom_db.UniProtSeq, unp_pos.to_dict('records'))
            console.log(f'Done: {len(unp_pos)+chunksize*i}')
def insert_iso_range(ctx, chunksize):
    """Copy described-isoform ranges into the UniProtAnnotation table.

    Reads every ALTERNATIVE_PRODUCTS row whose ``iso_range`` is set,
    flattens the JSON-encoded [start, end] pairs, and bulk-inserts them as
    'iso_range' annotation records, ``chunksize`` source rows at a time.
    """
    def flatten(rows):
        # Yield one annotation record per [start, end] pair of every isoform.
        for unp, raw_range in rows:
            for unp_start, unp_end in json.loads(raw_range):
                yield dict(UniProt=unp,
                           unp_start=unp_start,
                           unp_end=unp_end,
                           resource='iso_range',
                           resource_id=str(unp_start))

    target_db = ctx.obj['custom_db']
    source_db = Identifier.sqlite_api
    total = unsync_run(source_db.database.fetch_one(
        query="SELECT COUNT(*) FROM ALTERNATIVE_PRODUCTS WHERE sequenceStatus='described' AND iso_range != 'NaN'"))[0]
    console.log(f"Total {total} to query")
    for chunk_idx in range(ceil(total/chunksize)):
        rows = unsync_run(source_db.database.fetch_all(
            query=f"""
            SELECT isoform, iso_range FROM ALTERNATIVE_PRODUCTS
            WHERE sequenceStatus = 'described' AND iso_range != 'NaN'
            LIMIT {chunksize} OFFSET {chunksize*chunk_idx}
            """))
        target_db.sync_insert(target_db.UniProtAnnotation, tuple(flatten(rows)))
        console.log(f'Done: {len(rows)+chunksize*chunk_idx}')
def id_mapping(ctx, input, column, sep, chunksize, auto_assign, sleep):
    """Resolve ftIds to UniProt identifiers and store them in IDMapping.

    With no input file, unmapped ftIds are pulled from the Mutation table in
    batches; otherwise the ids come from one column of the delimited file
    ``input``. Each batch is mapped via ``map2unp`` and bulk-inserted, with
    an optional random pause between batches when ``sleep`` is set.
    """
    sqlite_api = ctx.obj['custom_db']
    cols = ('ftId', 'Entry', 'isoform', 'is_canonical')
    Identifier.auto_assign_when_seq_conflict = auto_assign

    def map_batch(identifiers):
        # Run the remote map2unp fetch under a progress bar.
        with Progress(*progress_bar_args) as p:
            return Identifiers(identifiers).fetch('map2unp').run(p.track).result()

    if input is None:
        total = unsync_run(sqlite_api.database.fetch_one(
            query="SELECT COUNT(DISTINCT ftId) FROM Mutation WHERE ftId NOT IN (SELECT DISTINCT ftId FROM IDMapping)"))[0]
        console.log(f"Total {total} to query")
        # No OFFSET needed: each insert shrinks the not-yet-mapped set, so
        # the same LIMITed query always returns the next unmapped batch.
        query = f"""
            SELECT DISTINCT ftId FROM Mutation
            WHERE ftId NOT IN (SELECT DISTINCT ftId FROM IDMapping)
            LIMIT {chunksize}
            """
        for batch_idx in range(ceil(total/chunksize)):
            rows = unsync_run(sqlite_api.database.fetch_all(query=query))
            if not rows:
                break
            mapped = map_batch(row[0] for row in rows)
            records = [dict(zip(cols, item)) for item in mapped]
            if records:
                sqlite_api.sync_insert(sqlite_api.IDMapping, records)
            console.log(f'Done: {len(mapped)+chunksize*batch_idx}')
            if sleep:
                tsleep(uniform(1, 10))
    else:
        if column is None:
            ids = read_csv(input, sep=sep, header=None)[0].unique()
        else:
            ids = read_csv(input, sep=sep, usecols=[column])[column].unique()
        total = len(ids)
        console.log(f"Total {total} to query")
        for offset in range(0, total, chunksize):
            mapped = map_batch(ids[offset:offset+chunksize])
            records = [dict(zip(cols, item)) for item in mapped]
            if records:
                sqlite_api.sync_insert(sqlite_api.IDMapping, records)
            console.log(f'Done: {len(mapped)+offset}')
            if sleep:
                tsleep(uniform(1, 10))
def query_from_DB_with_unps(self, table_name: str, columns: str = '*'):
    """Schedule local-DB queries for every UniProt-sourced identifier in *self*.

    Accessions already present in the local INFO table are served from the
    requested table directly; missing accessions fall back to per-accession
    ``query_from_DB_with_unp`` calls. The resulting task list is stored on
    ``self.tasks`` and *self* is returned for chaining.

    Bug fixed: the previous version interpolated a Python tuple repr into
    the SQL (``... IN ('P1',)`` for a single accession), which is a syntax
    error in SQLite. The IN-list is now rendered explicitly.
    """
    def in_clause(ids):
        # Render an SQL IN-list valid for any number of elements (a 1-tuple's
        # Python repr carries a trailing comma that SQL rejects).
        return ','.join(f"'{i}'" for i in ids)

    default_tables = ('DB_REFERENCES', 'OTHER_DB_REFERENCES', 'ALTERNATIVE_PRODUCTS', 'FEATURES', 'INTERACTION', 'INFO')
    assert table_name in default_tables
    obs = tuple(i for i in self if i.source == 'UniProt')
    if len(obs) == 0:
        self.tasks = []
        return self
    accessions = tuple(i.identifier for i in obs)
    if columns != '*' and table_name == 'INFO':
        task = Identifier.sqlite_api.database.fetch_all(
            query=f'SELECT {columns} FROM INFO WHERE accession IN ({in_clause(accessions)})')
    else:
        # Probe INFO via the ORM to learn which accessions exist locally.
        task = Identifier.sqlite_api.INFO.objects.filter(
            accession__in=accessions).all()
    exists = unsync_run(task)
    if len(exists) == 0:
        # Nothing cached locally: query each accession individually.
        self.tasks = [
            ob.query_from_DB_with_unp(table_name=table_name, columns=columns, exists=False)
            for ob in obs
        ]
        return self
    exist_ids = frozenset(i.accession for i in exists)
    rest_ids = frozenset(accessions) - exist_ids
    # Accessions missing from the local DB still need individual lookups.
    rest_dfs = [
        self[accession].query_from_DB_with_unp(table_name=table_name, columns=columns, exists=False)
        for accession in rest_ids
    ]
    if table_name == 'INFO':
        # The probe above already fetched the INFO rows we need.
        ap = unsync_wrap(exists)
    elif columns == '*':
        ap = unsync_wrap(
            getattr(Identifier.sqlite_api, table_name).objects.filter(
                accession__in=exist_ids).all())
    else:
        ap = unsync_wrap(
            Identifier.sqlite_api.database.fetch_all(
                query=f'SELECT {columns} FROM {table_name} WHERE accession IN ({in_clause(exist_ids)})'))
    rest_dfs.append(ap)
    self.tasks = rest_dfs
    return self
def sifts_mapping(ctx, input, column, sep, func, kwargs, chunksize, entry_filter, chain_filter, skip_pdbs, omit, output, iteroutput, sleep):
    """Run a SIFTS fetch (``func``) over UniProt ids, appending results to a TSV.

    Ids come either from the IDMapping table (``input is None``) or from one
    column of the delimited file ``input``. ``omit`` skips that many leading
    ids; ``sleep`` throttles between full chunks.
    """
    def pick_unp(row):
        # Canonical entries are addressed by Entry, the rest by isoform.
        entry, isoform, is_canonical = row
        return entry if is_canonical else isoform

    def dump(frame):
        # Append one result frame with a stable column order; the header is
        # written only when the file does not exist yet.
        frame[sorted(frame.columns)].to_csv(
            output_path, sep='\t', index=False,
            header=not output_path.exists(), mode='a+')

    # NOTE(review): CLI-supplied values are eval'ed verbatim -- acceptable for
    # a trusted command line, but never expose this path to untrusted input.
    kwargs = dict(sub.split('=') for item in kwargs for sub in item.split(';'))
    if len(kwargs) > 0:
        for key, value in kwargs.items():
            kwargs[key] = eval(value)
        console.log(f"take args: {kwargs}")
    skip_pdbs = [pdbi for item in skip_pdbs for pdbi in item.split(',')]
    if skip_pdbs:
        kwargs['skip_pdbs'] = skip_pdbs
    SIFTS.entry_filter = entry_filter
    SIFTS.chain_filter = chain_filter
    sqlite_api = ctx.obj['custom_db']
    output = f'{func}.tsv' if output == '' else output
    output_path = ctx.obj['folder']/output

    if input is None:
        total = unsync_run(sqlite_api.database.fetch_one(
            query="SELECT COUNT(DISTINCT isoform) FROM IDMapping WHERE isoform != 'NaN'"))[0] - omit
        console.log(f"Total {total} to query")
        for chunk_idx in range(ceil(total/chunksize)):
            rows = unsync_run(sqlite_api.database.fetch_all(
                query=f"""
                SELECT DISTINCT Entry,isoform,is_canonical FROM IDMapping
                WHERE isoform != 'NaN'
                LIMIT {chunksize} OFFSET {omit+chunksize*chunk_idx}
                """))
            with Progress(*progress_bar_args) as p:
                res = SIFTSs(map(pick_unp, rows)).fetch(func, **kwargs).run(p.track).result()
            for dfrm in res:
                if dfrm is None:
                    continue
                dump(dfrm)
            console.log(f'Done: {len(res)+chunksize*chunk_idx}')
            if sleep and len(res) == chunksize:
                tsleep(uniform(1, 10))
    else:
        skiprows = omit if omit > 0 else None
        if column is None:
            ids = read_csv(input, sep=sep, header=None, skiprows=skiprows)[0].unique()
        else:
            ids = read_csv(input, sep=sep, usecols=[column], skiprows=skiprows)[column].unique()
        total = len(ids)
        console.log(f"Total {total} to query")
        for start in range(0, total, chunksize):
            with Progress(*progress_bar_args) as p:
                res = SIFTSs(ids[start:start+chunksize]).fetch(func, **kwargs).run(p.track).result()
            if iteroutput:
                # Non-DataFrame results (including None) are skipped.
                for dfrm in res:
                    if isinstance(dfrm, DataFrame):
                        dump(dfrm)
            else:
                DataFrame(res).to_csv(output_path, sep='\t', index=False, header=False, mode='a+')
            console.log(f'Done: {start+len(res)}')
            if sleep and len(res) == chunksize:
                tsleep(uniform(1, 10))
>>> writer( reader(f'http://www.ebi.ac.uk/pdbe/static/entry/download/{header}.cif.gz'), f'{header}-pdbe_chain_remapping.cif', b'data_%s\n#\nloop_\n' % bytes(header, 'utf-8'), b'_pdbe_chain_remapping' ).result() >>> parser( reader(f'http://www.ebi.ac.uk/pdbe/static/entry/download/{header}.cif.gz'), ('data_%s\n' % header, '#\n', 'loop_\n'), b'_pdbe_chain_remapping' ) >>> ''' semaphore = unsync_run(init_semaphore(10)) def iter_index(text, target, add): ''' >>> text = b'sdgfsd\nsdgsdg\nfdsg\nd' >>> index = (None, *iter_index(text), None) >>> print(index) >>> tuple(text[start:end] for start,end in zip(index, index[1:])) ''' text_len = len(text) start = -1 while True: try: res = text.index(target, start + 1) + add yield res