def tissues_x_proteins(self, normalized=True, tissues=None):
    '''
    For all tissues downloads the expression of all the proteins.
    In the result, a dict of dicts will hold the expression values
    of each protein, grouped by samples.
    '''
    self.get_tissues()
    tissues_selected = set([
        t['TISSUE_ID'] for t in self.tissues
        if tissues is None or t['TISSUE_ID'] in tissues
    ]) - self.tissues_loaded
    prg = progress.Progress(
        len(tissues_selected),
        'Downloading expression data',
        1,
        percent=False)

    for tis in tissues_selected:
        prg.step()
        sys.stdout.write('Querying tissue %s\n' % tis)
        sys.stdout.flush()
        self.get_proteins(tis)

        if not hasattr(self.result, 'read'):
            sys.stdout.write('\tFailed: %s\n' % tis)
            sys.stdout.flush()
        else:
            self.tissues_loaded.add(tis)
            self.get_expression(normalized)

            if tis not in self.samples:
                self.samples[tis] = []

            self.samples[tis] = uniq_list(
                self.samples[tis] + list(self.current_samples))
            self.current_samples = set([])

    prg.terminate()
def smiles2chembl(self, smiles):
    self.result = {}
    prg = progress.Progress(
        total=len(smiles), name='Translating SMILES', interval=1)

    for sml in smiles:
        url = self.chembl_url.format(sml)
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[sml] = []

        if result is not None:
            try:
                data = json.loads(result)

                for d in data['compounds']:
                    this_smile = d['smiles']
                    this_chembl = d['chemblId']
                    # if this_smile == sml:
                    self.result[sml].append(this_chembl)

            except ValueError:
                soup = bs4.BeautifulSoup(result, 'html.parser')
                compounds = soup.find_all('compound')

                if compounds is not None:
                    for compound in compounds:
                        this_smile = compound.find('smiles').text
                        this_chembl = compound.find('chemblid').text
                        # if this_smile == sml:
                        self.result[sml].append(this_chembl)

        prg.step()

    prg.terminate()
def load_collection(self, collname, id_type='entrez', map_ids=True,
                    cachedir='cache'):

    if os.path.exists(os.path.join(cachedir, 'gsea-%s.pickle' % collname)):
        self.load([collname])
        return None

    url = self.collections[collname]['urls'][id_type]
    data = dataio.curl(
        url,
        req_headers=self.session,
        silent=False,
        cache=False,
        write_cache=True)
    data = data.split('\n')
    names = []
    prg = progress.Progress(len(data), 'Loading gene sets', 1)

    for line in (l.split('\t') for l in data if len(l) > 0):
        prg.step()
        setname = line[0].strip()
        self.write_set(line[2:], setname, id_type, map_ids)
        self.get_desc(setname)
        names.append(setname)

    prg.terminate()
    self.groups[collname] = set(names)
    self.save([collname], cachedir=cachedir)
def annotate(graph, organism=9606, aspects=('C', 'F', 'P')):
    """
    Adds Gene Ontology annotations to the nodes of a graph.

    :param igraph.Graph graph:
        Any ``igraph.Graph`` object with UniProt IDs in its ``name``
        vertex attribute.
    """
    aspects = aspects if type(aspects) in {list, tuple} else (aspects, )

    graph.vs['go'] = [
        {'C': set(), 'F': set(), 'P': set()}
        for _ in xrange(graph.vcount())
    ]

    terms, annot = dataio.go_annotations_goa(organism=organism)

    prg = progress.Progress(graph.vcount(), 'Loading GO annotations', 9)

    for v in graph.vs:
        prg.step()

        for asp in aspects:
            if v['name'] in annot[asp]:
                v['go'][asp] = annot[asp][v['name']]

    prg.terminate()
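# Usage sketch (illustrative only, not part of the module): assumes a network
# already built with the legacy igraph backend, e.g. via
# ``pa = legacy.main.PyPath(); pa.init_network()``; after annotation each
# vertex carries a ``go`` dict of GO term sets per aspect.
#
#     annotate(pa.graph, organism=9606, aspects=('C', 'F', 'P'))
#     pa.graph.vs[0]['go']['P']   # set of biological process GO terms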
def compounds_targets_mechanism(self,
                                id_list,
                                id_type='uniprot',
                                domains=False,
                                pred_bind_d=False,
                                activities=False,
                                pchembl=False,
                                one_query=False,
                                client_side=False):

    if id_type == 'uniprot':
        compound_lookup = True
        id_list = self.get_chembl_uniprots(id_list)

    self.result = []
    id_list = id_list if type(id_list) is list else [id_list]

    if one_query:
        query_thread = threading.Thread(
            target=self.compound_target_mechanism,
            args=[id_list],
            kwargs={
                'id_type': id_type,
                'domains': domains,
                'pred_bind_d': pred_bind_d,
                'activities': activities,
                'pchembl': pchembl,
            })
        query_thread.daemon = True
        query_thread.start()
        sys.stdout.write('\n')
        sys.stdout.flush()

        while query_thread.is_alive():
            self.mysql.print_status()
            time.sleep(1)

        self.mysql_ready()

        if client_side:
            self.result = list(self.result)

    else:
        prg = progress.Progress(
            total=len(id_list), name='Sending queries', interval=5)
        qids = []

        for identifier in id_list:
            prg.step()
            qids.append(
                self.compound_target_mechanism(
                    identifier,
                    id_type=id_type,
                    domains=domains,
                    pred_bind_d=pred_bind_d,
                    activities=activities,
                    pchembl=pchembl,
                    wait=False))

        prg.terminate()
        self.mysql_ready(qids)

        for qid in qids:
            self.result += list(self.mysql.get_result(qid))
def _make_df_igraph(
        self,
        unique_pairs = True,
        extra_node_attrs = None,
        extra_edge_attrs = None,
    ):
    """
    See docs at method ``make_df``.
    """

    self._log('Creating data frame from `legacy.main.PyPath` object.')

    result = []

    self.pa.genesymbol_labels()

    self.extra_node_attrs = extra_node_attrs or self.extra_node_attrs
    self.extra_edge_attrs = extra_edge_attrs or self.extra_edge_attrs

    dtypes = (
        self.default_dtypes_uniquepairs
            if unique_pairs else
        self.default_dtypes_bydirs
    )

    header = self.get_header(unique_pairs = unique_pairs)

    prg = progress.Progress(
        total = self.graph.ecount(),
        name = 'Creating table',
        interval = 31,
    )

    for e in self.graph.es:
        # adding default fields
        lines = (
            self._process_edge_uniquepairs_igraph(e)
                if unique_pairs else
            self._process_edge_bydirection_igraph(e)
        )
        result.extend(lines)
        prg.step()

    prg.terminate()

    self.df = pd.DataFrame(result, columns = header)
    self.df = self.df.astype(dtypes)
def inchikey2anything(self, target, lst):
    self.result = {}
    target = str(target) if type(target) is int else self.name_dict[target]
    prg = progress.Progress(
        total=len(lst), name='Translating InChIKeys', interval=1)

    for inchik in lst:
        url = self.inchi_stem % inchik
        c = curl.Curl(url, large=False)
        result = c.result

        if result is not None:
            data = json.loads(result)
            self.result[inchik] = [
                d['src_compound_id'] for d in data if d['src_id'] == target
            ]

        prg.step()

    prg.terminate()
def get_pubmeds(pmids):

    pmids = [str(pmid) for pmid in pmids]
    url = urls.urls['pubmed-eutils']['url']
    cache = len(pmids) < 10
    data = {}
    prg = progress.Progress(
        len(pmids) // 100 + 1,
        'Retrieving data from NCBI e-utils',
        1,
        percent=False)

    for offset in xrange(0, len(pmids), 100):
        prg.step()
        post = {
            'id': ','.join(pmids[offset:offset + 100]),
            'retmode': 'json',
            'db': 'pubmed',
        }

        for i in xrange(3):
            try:
                c = curl.Curl(
                    url,
                    silent=False,
                    cache=cache,
                    post=post,
                    override_post=True,
                )
                res = c.result
                data = dict(
                    [(k, v) for k, v in iteritems(json.loads(res)['result'])] +
                    [(k, v) for k, v in iteritems(data)]
                )
                break

            except ValueError:
                sys.stdout.write('\t:: Error in JSON, retry %u\n' % i)
                sys.stdout.flush()

    prg.terminate()

    return data
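# Usage sketch (illustrative): PMIDs may be passed as integers or strings;
# assuming the configured e-utils URL is the eSummary endpoint, the returned
# dict is keyed by PMID (plus a 'uids' entry) and holds the eSummary records.
#
#     pubmed_data = get_pubmeds([21959251, 24190970])
#     pubmed_data['21959251']['title']   # article title, if the query succeeded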
def connectivity_search(self,
                        id_list,
                        id_type,
                        parameters=[1, 0, 0, 0, 0, 1, 0]):
    '''
    Performs a connectivity search.

    `parameters` is a list of parameters A-H as described in
    https://www.ebi.ac.uk/unichem/info/widesearchInfo;
    parameter H is always set to 1 in order to process the result.
    '''
    # copy before appending, to avoid mutating the default argument
    parameters = [str(i) for i in parameters] + ['1']
    self.result = {}

    if id_type == 'inchikey':
        id_type = ''
        method = 'key_search'
    elif id_type == 'smiles':
        self.result = None
        return None
    else:
        id_type = (
            str(id_type) if type(id_type) is int else self.name_dict[id_type]
        )
        id_type = '%s/' % id_type
        method = 'cpd_search'

    prg = progress.Progress(
        total=len(id_list), name='Connectivity search', interval=1)

    for i in id_list:
        prg.step()
        url = self.cpd_search.format(method, i, id_type, '/'.join(parameters))
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[i] = []

        if result is not None:
            data = json.loads(result)

            for k, v in iteritems(data):
                for j in range(1, len(v)):
                    self.result[i].append(v[j][0])

        self.result[i] = list(set(self.result[i]))

    prg.terminate()
def translate(self, source, target, lst):

    if source == 'inchikey':
        self.inchikey2anything(target, lst)
        return None

    if source == 'smiles':
        self.smiles2chembl(lst)
        return None

    self.result = {}
    source = str(source) if type(source) is int else self.name_dict[source]
    target = str(target) if type(target) is int else self.name_dict[target]
    prg = progress.Progress(
        total=len(lst), name='Translating compound identifiers', interval=1)

    for comp in lst:
        url = '/'.join([self.url_stem, comp, source, target])
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[comp] = []

        if result is not None:
            data = json.loads(result)

            for d in data:
                self.result[comp].append(d['src_compound_id'])

        prg.step()

    prg.terminate()
def get_pfam(uniprots=None, organism=9606):

    if uniprots is None:
        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )

    u_pfam = {}
    pfam_u = {}

    if uniprots is not None:
        prg = progress.Progress(
            len(uniprots) // 30,
            'Downloading data from UniProt',
            1,
        )
        data_all = []

        for i in xrange(0, len(uniprots), 30):
            to = i + 30
            thisPart = uniprots[i:to]
            thisPart = ' OR '.join(['accession:%s' % u for u in thisPart])
            get = {
                'query': thisPart,
                'format': 'tab',
                'columns': 'id,database(Pfam)',
            }

            for j in xrange(3):
                c = curl.Curl(urls.urls['uniprot_basic']['url'], get=get)
                data = c.result

                if data is not None:
                    break

            if data is None:
                return None, None

            data = data.split('\n')
            del data[0]
            del data[-1]
            data_all += data
            prg.step()

        prg.terminate()

    else:
        organism = taxonomy.ensure_ncbi_tax_id(organism)

        if not organism:
            return None, None

        organismQuery = 'organism:%u AND reviewed:yes' % organism
        get = {
            'query': organismQuery,
            'format': 'tab',
            'columns': 'id,database(Pfam)',
        }

        for j in xrange(3):
            c = curl.Curl(
                urls.urls['uniprot_basic']['url'],
                get=get,
                silent=False,
                outf='uniprot-pfam-%u.tab' % organism,
            )
            data_all = c.result

            if data_all is not None:
                break

        if data_all is None:
            return None, None

        data_all = data_all.split('\n')
        del data_all[0]

    for l in data_all:
        l = l.split('\t')
        pfams = re.sub(';$', '', l[1]).strip()
        pfams = pfams.split(';') if pfams else []

        if l[0] not in u_pfam:
            u_pfam[l[0]] = []

        u_pfam[l[0]] += pfams

        for pfam in pfams:
            if pfam not in pfam_u:
                pfam_u[pfam] = []

            pfam_u[pfam].append(l[0])

    return u_pfam, pfam_u
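# Usage sketch (illustrative): returns two dicts of lists, UniProt -> Pfam
# accessions and Pfam -> UniProts; with ``uniprots=None`` the reviewed
# proteome of ``organism`` is queried instead.
#
#     u_pfam, pfam_u = get_pfam(uniprots=['P00533', 'P04637'])
#     u_pfam.get('P00533', [])   # Pfam accessions annotated for EGFR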
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    positive_terms = {'activation', 'expression'}
    negative_terms = {'inhibition', 'repression'}
    transc_terms = {'expression', 'repression'}
    mechanism_terms = {
        'phosphorylation',
        'binding/association',
        'dissociation',
        'ubiquitination',
        'dephosphorylation',
        'glycosylation',
        'state change',
        'methylation',
    }
    direct_terms = {'indirect effect'}

    KeggInteraction = collections.namedtuple(
        'KeggInteraction',
        [
            'id_a',
            'id_b',
            'effect',
            'pathway',
            'mechanism',
            'is_direct',
            'transcriptional',
        ],
    )

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent=True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href=True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent=False)

    for hsa, pw in hsa_list:
        prg.step()
        c = curl.Curl(
            urls.urls['kegg_pws']['kgml_url_2'] % hsa,
            silent=True,
            req_headers=req_hdrs,
        )
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        uentries = dict([
            (
                eid,
                common.uniq_list(
                    common.flat_list([
                        mapping.map_name(
                            gn, 'genesymbol', 'uniprot', strict=True)
                        for gn in gns
                    ])
                )
            )
            for eid, gns in iteritems(entries)
        ])

        for rel in kgmlsoup.find_all('relation'):
            subtypes = {st.attrs['name'] for st in rel.find_all('subtype')}

            if (
                rel.attrs['entry1'] in uentries and
                rel.attrs['entry2'] in uentries and
                subtypes
            ):
                is_direct = 'indirect effect' not in subtypes
                effect = (
                    'inhibition' if negative_terms & subtypes else
                    'activation' if positive_terms & subtypes else
                    'unknown'
                )
                mechanism = ';'.join(mechanism_terms & subtypes)
                transcriptional = bool(transc_terms & subtypes)

                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append(
                            KeggInteraction(
                                id_a=u1,
                                id_b=u2,
                                effect=effect,
                                pathway=pw,
                                mechanism=mechanism,
                                is_direct=is_direct,
                                transcriptional=transcriptional,
                            )
                        )

    prg.terminate()

    return common.uniq_list(interactions)
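# Usage sketch (illustrative): each record is a ``KeggInteraction`` named
# tuple of a UniProt pair with effect sign, pathway name, mechanism and
# directness/transcriptionality flags.
#
#     kegg = kegg_interactions()
#     [i for i in kegg if i.effect == 'inhibition' and i.is_direct][:5]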
def get_pfam_regions(
        uniprots=[],
        pfams=[],
        keepfile=False,
        dicts='both',
    ):

    url = urls.urls['pfam_up']['url']
    outf = url.split('/')[-1]
    urlmd5 = common.md5(url)

    if not os.path.exists(settings.get('cachedir')):
        os.makedirs(settings.get('cachedir'))

    cachefile = os.path.join(settings.get('cachedir'), urlmd5 + '-' + outf)
    u_pfam = {}
    pfam_u = {}
    uniprots = set(uniprots)
    pfams = set(pfams)

    if not os.path.exists(cachefile):
        sys.stdout.write(
            '\t:: Downloading data from %s' %
            url.replace('http://', '').replace('ftp://', '').split('/')[0])
        sys.stdout.flush()

        if hasattr(urllib, 'urlretrieve'):
            urllib.urlretrieve(url, cachefile)
        else:
            urllib.request.urlretrieve(url, cachefile)

        sys.stdout.write('\n')

    with open(cachefile, 'rb') as f:
        # uncompressed size from the gzip footer
        f.seek(-4, 2)
        gzsize = struct.unpack('<I', f.read())[0]

    prg = progress.Progress(gzsize, 'Processing Pfam domains', 11)

    # open in text mode so the fields are strings and can be compared
    # against the `uniprots` and `pfams` sets
    with gzip.open(cachefile, 'rt') as f:

        for l in f:
            prg.step(len(l))
            l = l.strip().split()

            if l[0] in uniprots or l[4] in pfams:

                if dicts in ['uniprot', 'both']:
                    if l[0] not in u_pfam:
                        u_pfam[l[0]] = {}
                    if l[4] not in u_pfam[l[0]]:
                        u_pfam[l[0]][l[4]] = []
                    u_pfam[l[0]][l[4]].append({
                        'isoform': int(l[1]),
                        'start': int(l[5]),
                        'end': int(l[6]),
                    })

                if dicts in ['pfam', 'both']:
                    if l[4] not in pfam_u:
                        pfam_u[l[4]] = {}
                    if l[0] not in pfam_u[l[4]]:
                        pfam_u[l[4]][l[0]] = []
                    pfam_u[l[4]][l[0]].append({
                        'isoform': int(l[1]),
                        'start': int(l[5]),
                        'end': int(l[6]),
                    })

    prg.terminate()

    if not keepfile:
        os.remove(cachefile)

    if dicts == 'uniprot':
        return u_pfam
    elif dicts == 'pfam':
        return pfam_u
    else:
        return u_pfam, pfam_u
def pathwaycommons_interactions(
        resources=None,
        types=None,
        by_interaction=False,
        version=12,
    ):

    interactions = collections.defaultdict(set) if by_interaction else []

    types = common.to_set(types)
    resources = {
        res.lower()
        for res in (
            common.to_list(resources) or
            (pc_res.name for pc_res in pathwaycommons_resources)
        )
    }

    prg = progress.Progress(
        len(resources),
        'Processing PathwayCommons',
        1,
        percent=False,
    )

    url = urls.urls['pwcommons']['url']

    for resource in pathwaycommons_resources:

        if not resources & {resource.pc_label, resource.name.lower()}:
            continue

        prg.step()
        _version = min(resource.version, version)
        resource_url = url % (_version, _version, resource.pc_label)
        c = curl.Curl(resource_url, silent=False, large=True)

        for l in c.result:

            if hasattr(l, 'decode'):
                l = l.decode('ascii')

            l = l.strip('\n\r').split('\t')

            if not types or l[1] in types:

                if by_interaction:
                    a_b = (l[0], l[1], l[2])
                    b_a = (l[2], l[1], l[0])
                    directed = l[1] in pathwaycommons_directed_types
                    key = (
                        b_a
                        if (
                            a_b not in interactions and
                            not directed and
                            b_a in interactions
                        ) else
                        a_b
                    )
                    interactions[key].add(
                        PathwayCommonsInteraction(
                            *key,
                            resource=resource.name
                        )
                    )

                else:
                    l.append(resource.name)
                    interactions.append(PathwayCommonsInteraction(*l))

    prg.terminate()

    return interactions
def resource_to_relationships_graph(
        self,
        graph,
    ) -> None:
    """
    Converts a PyPath igraph object into a list of BEL relationships.
    """

    self._log('Building bel graph from PyPath object (igraph graph).')

    edges = graph.es
    prg = progress.Progress(
        len(edges),
        'Building bel graph from PyPath object (igraph graph).',
        1,
    )

    for edge in edges:
        prg.step()
        directions = edge['dirs']

        for direction in (directions.straight, directions.reverse):

            if not directions.dirs[direction]:
                # this direction does not exist
                continue

            dir_sources = directions.get_dir(direction, sources=True)

            if self.only_sources and not dir_sources & self.only_sources:
                # this direction not provided
                # in the currently enabled set of sources
                continue

            predicates = set()

            activation, inhibition = (
                directions.get_sign(direction, sources=True)
            )

            if self._check_sign(activation):
                predicates.add(pc.DIRECTLY_INCREASES)

            if self._check_sign(inhibition):
                predicates.add(pc.DIRECTLY_DECREASES)

            if not predicates:
                # use `regulates` if sign is unknown
                predicates.add(pc.REGULATES)

            source = self._protein(direction[0])
            target = self._protein(direction[1])
            evid_cits = self._references(edge, direction)

            for (predicate, (evid, cits)) in itertools.product(
                predicates, evid_cits
            ):
                for cit in cits:
                    self.bel_graph.add_qualified_edge(
                        source,
                        target,
                        relation=predicate,
                        citation=cit,
                        evidence='OmniPath',
                    )
                    self.bel_graph.add_qualified_edge(
                        source,
                        target,
                        relation=predicate,
                        citation=cit,
                        evidence=evid,
                    )

        if not self._has_direction(directions):
            # add an undirected relationship
            # if no direction available
            evid_cits = self._references(edge, 'undirected')
            source = self._protein(directions.nodes[0])
            target = self._protein(directions.nodes[1])

            for evid, cits in evid_cits:
                for cit in cits:
                    self.bel_graph.add_association(
                        source,
                        target,
                        citation=cit,
                        evidence='OmniPath',
                    )
                    self.bel_graph.add_association(
                        source,
                        target,
                        citation=cit,
                        evidence=evid,
                    )

    prg.terminate()
    self._log('Building bel graph from PyPath object finished.')
def compounds_targets(self,
                      id_list,
                      id_type='uniprot',
                      assay_types=['B', 'F'],
                      relationship_types=['D', 'H'],
                      compound_props=[],
                      domains=False,
                      pred_bind_d=False,
                      action_type=False,
                      activities=False,
                      pchembl=False,
                      one_query=False,
                      client_side=False):
    '''
    Queries each identifier in a separate MySQL query, unless
    ``one_query`` is True. Better performance is expected when a single
    batch query would require disk_tmp_table.
    '''

    if id_type == 'uniprot':
        compound_lookup = True
        id_list = self.get_chembl_uniprots(id_list)

    self.result = []
    id_list = id_list if type(id_list) is list else [id_list]

    if one_query:
        query_thread = threading.Thread(
            target=self.compound_target,
            args=[id_list],
            kwargs={
                'id_type': id_type,
                'assay_types': assay_types,
                'relationship_types': relationship_types,
                'compound_props': compound_props,
                'domains': domains,
                'pred_bind_d': pred_bind_d,
                'action_type': action_type,
                'activities': activities,
                'pchembl': pchembl,
            })
        query_thread.daemon = True
        query_thread.start()
        sys.stdout.write('\n')
        sys.stdout.flush()

        while query_thread.is_alive():
            self.mysql.print_status()
            time.sleep(1)

        self.mysql_ready()

        if client_side:
            self.result = list(self.result)

    else:
        prg = progress.Progress(
            total=len(id_list), name='Starting queries', interval=5)
        qids = []

        for identifier in id_list:
            prg.step()
            qids.append(
                self.compound_target(
                    identifier,
                    id_type=id_type,
                    assay_types=assay_types,
                    relationship_types=relationship_types,
                    compound_props=compound_props,
                    domains=domains,
                    pred_bind_d=pred_bind_d,
                    action_type=action_type,
                    activities=activities,
                    pchembl=pchembl,
                    wait=False))

        prg.terminate()
        self.mysql_ready(qids)

        for qid in qids:
            self.result.extend(list(self.mysql.get_result(qid)))
def signor_pathways(**kwargs):
    """
    Obtains pathway annotations from Signor.
    """

    url = urls.urls['signor']['list_url']
    baseurl = urls.urls['signor']['all_url_new']

    proteins_pathways = {}
    interactions_pathways = {}

    c = curl.Curl(url, silent=True)
    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    pathway_names = [
        (opt['value'], opt.text)
        for opt in soup.find(
            'select',
            {'name': 'pathway_list'},
        ).findAll('option')
    ]

    prg = progress.Progress(
        len(pathway_names),
        'Downloading data from Signor',
        1,
        percent=False,
    )

    for short, full in pathway_names:
        prg.step()

        if not short:
            continue

        binary_data = [
            (b'pathway_list', short.encode('ascii')),
            (b'submit', b'Download'),
        ]

        c_pw = curl.Curl(
            baseurl,
            silent=True,
            binary_data=binary_data,
            encoding='utf-8',
        )

        # csv.DictReader(c_pw.result)
        sep = '@#@#@'
        lines = inputs_common.csv_sep_change(
            c_pw.result,
            '\t',
            sep,
        ).split('\n')[1:]

        data = list(
            filter(
                lambda l: len(l) > 6,
                map(lambda l: l.strip().split(sep), lines),
            )
        )

        proteins_pathways[full] = set()
        interactions_pathways[full] = set()

        for row in data:
            for uniprot1, uniprot2 in itertools.product(
                mapping.map_name(row[4], 'uniprot', 'uniprot'),
                mapping.map_name(row[8], 'uniprot', 'uniprot'),
            ):
                proteins_pathways[full].add(uniprot1)
                proteins_pathways[full].add(uniprot2)
                interactions_pathways[full].add((uniprot1, uniprot2))

    prg.terminate()

    return proteins_pathways, interactions_pathways
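# Usage sketch (illustrative): two dicts keyed by pathway name, one holding
# the member UniProts, the other the (source, target) interaction pairs; the
# pathway name used below is hypothetical, the actual keys depend on the
# current Signor pathway list.
#
#     proteins, interactions = signor_pathways()
#     sorted(proteins)[:5]                 # first few pathway names
#     interactions[sorted(proteins)[0]]    # interaction pairs of one pathway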
def kegg_interactions():
    """
    Downloads and processes KEGG Pathways.
    Returns list of interactions.
    """

    rehsa = re.compile(r'.*(hsa[0-9]+).*')
    req_hdrs = [
        'Referer: http://www.genome.jp/kegg-bin/show_pathway'
        '?map=hsa04710&show_description=show'
    ]
    hsa_list = []
    interactions = []

    c = curl.Curl(urls.urls['kegg_pws']['list_url'], silent = True)
    htmllst = c.result
    lstsoup = bs4.BeautifulSoup(htmllst, 'html.parser')

    for a in lstsoup.find_all('a', href = True):
        m = rehsa.match(a['href'])

        if m:
            hsa_list.append((m.groups(0)[0], a.text))

    prg = progress.Progress(
        len(hsa_list), 'Processing KEGG Pathways', 1, percent = False)

    for hsa, pw in hsa_list:
        prg.step()
        c = curl.Curl(
            urls.urls['kegg_pws']['kgml_url_2'] % hsa,
            silent = True,
            req_headers = req_hdrs,
        )
        kgml = c.result
        kgmlsoup = bs4.BeautifulSoup(kgml, 'html.parser')
        entries = {}

        for ent in kgmlsoup.find_all('entry'):
            gr = ent.find('graphics')

            if gr and 'name' in gr.attrs:
                entries[ent.attrs['id']] = [
                    n.strip()
                    for n in gr.attrs['name'].replace('...', '').split(',')
                ]

        uentries = dict([
            (
                eid,
                common.uniq_list(
                    common.flat_list([
                        mapping.map_name(
                            gn, 'genesymbol', 'uniprot', strict = True)
                        for gn in gns
                    ])
                )
            )
            for eid, gns in iteritems(entries)
        ])

        for rel in kgmlsoup.find_all('relation'):
            st = rel.find('subtype')

            if (
                rel.attrs['entry1'] in uentries and
                rel.attrs['entry2'] in uentries and
                st and
                'name' in st.attrs
            ):
                for u1 in uentries[rel.attrs['entry1']]:
                    for u2 in uentries[rel.attrs['entry2']]:
                        interactions.append((u1, u2, st.attrs['name'], pw))

    prg.terminate()

    return common.uniq_list(interactions)
def intact_interactions(
        miscore = 0.6,
        organism = 9606,
        complex_expansion = False,
        only_proteins = False,
        only_ids = False,
    ):
    """
    Downloads and processes IntAct interactions from the MI-TAB file.

    only_proteins : bool
        Keep only records of protein-protein interactions.
    only_ids : bool
        Load only the identifiers of interacting pairs
        (smaller memory footprint).
    """

    id_types = {
        'uniprotkb': 'uniprot',
    }

    IntactInteraction = collections.namedtuple(
        'IntactInteraction',
        (
            'id_a',
            'id_b',
            'id_type_a',
            'id_type_b',
            'pubmeds',
            'methods',
            'mi_score',
            'isoform_a',
            'isoform_b',
        ),
    )
    IntactInteraction.__new__.__defaults__ = (None,) * 7


    def get_id_type(field):
        id_type = None if field == '-' else field.split(':')[0]
        return id_types[id_type] if id_type in id_types else id_type


    def get_id(field):
        if field == '-':
            return None, None
        else:
            uniprot, isoform = _try_isoform(
                field.split(':')[1].replace('"', '')
            )
            uniprot = uniprot.split('-')[0]
            return uniprot, isoform


    def get_taxon(field):
        return (
            0 if field == '-' else
            field.split('|')[0].split(':')[1].split('(')[0]
        )


    results = []
    url = urls.urls['intact']['mitab']

    if type(organism) is int:
        organism = '%u' % organism

    c = curl.Curl(
        url,
        silent = False,
        large = True,
        files_needed = ['intact.txt'],
    )

    data = c.result['intact.txt']
    size = c.sizes['intact.txt']
    prg = progress.Progress(size, 'Reading IntAct MI-tab file', 99)

    for lnum, l in enumerate(data):
        prg.step(len(l))

        if lnum == 0:
            continue

        l = l.strip('\n\r ').split('\t')

        taxon_a = get_taxon(l[9])
        taxon_b = get_taxon(l[10])

        if (
            (
                organism is None or (
                    taxon_a == organism and
                    taxon_b == organism
                )
            ) and (
                complex_expansion or
                'expansion' not in l[15]
            )
        ):
            # finding mi-score and author
            sc = '0'
            au = '0'

            for s in l[14].split('|'):
                if s.startswith('intact-miscore'):
                    sc = s.split(':')[1]
                if s.startswith('author'):
                    au = len(s.split(':')[1])

            # filtering for mi-score
            if float(sc) < miscore:
                continue

            id_type_a = get_id_type(l[0])
            id_type_b = get_id_type(l[1])

            if (
                only_proteins and not (
                    id_type_a == 'uniprot' and
                    id_type_b == 'uniprot'
                )
            ):
                continue

            id_a, isoform_a = get_id(l[0])
            id_b, isoform_b = get_id(l[1])
            key = tuple(sorted((id_a, id_b)))

            pubmeds = set(
                ref[1]
                for ref in (
                    ref.split(':') for ref in l[8].split('|')
                )
                if ref[0] == 'pubmed'
            )
            methods = set(
                met.split('(')[1].strip(')"')
                for met in l[6].split('|')
            )

            results.append(
                IntactInteraction(
                    id_a = id_a,
                    id_b = id_b,
                    id_type_a = id_type_a,
                    id_type_b = id_type_b,
                    pubmeds = pubmeds,
                    methods = methods,
                    mi_score = sc,
                    isoform_a = isoform_a,
                    isoform_b = isoform_b,
                )
            )

    prg.terminate()

    return results
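# Usage sketch (illustrative): filtering for higher confidence human
# protein-protein interactions; note that ``mi_score`` is kept as a string.
#
#     ia = intact_interactions(miscore=0.7, organism=9606, only_proteins=True)
#     {i for rec in ia for i in (rec.id_a, rec.id_b)}   # interacting UniProts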