def parse_pseudomonas(session):
    """Load KEGG pathway interactions for both Pseudomonas strains.

    Fetches the shared KEGG compound cache on first use, registers one
    InteractionSource per strain, then parses each strain's KEGG
    organism code ('pae' for PAO1, 'pau' for PA14).
    """
    # Lazily populate the module-level KEGG compound cache.
    if not kegg_compounds:
        get_kegg_compounds()
    # is_experimental=2 means "unknown": KEGG provides no way to tell
    # whether the detection method was experimental.
    source_PAO1 = InteractionSource(data_source='KEGG(PAO1)', is_experimental=2)
    source_PA14 = InteractionSource(data_source='KEGG(PA14)', is_experimental=2)
    session.add(source_PAO1)
    session.add(source_PA14)
    session.commit()
    # Parse each strain against its own KEGG organism.
    parse_kegg('pae', 'PAO1', 'KEGG(PAO1)', session)
    parse_kegg('pau', 'PA14', 'KEGG(PA14)', session)
def parse(session):
    """Load Zhang computationally predicted PAO1 p-p interactions.

    Keeps only predictions with confidence >= 0.9.  All interactions
    share a single 'Zhang' source (is_experimental=0: computational
    predictions only).
    """
    with open('Data/PAO1/Zhang.csv') as handle:
        records = csv.DictReader(handle)
        zhang_source = InteractionSource(data_source='Zhang', is_experimental=0)
        session.add(zhang_source)
        session.commit()
        for record in records:
            # Discard low-confidence predictions.
            if float(record['Confidence']) < 0.9:
                continue
            prot_one = session.query(Interactor).get(record['Protein1'])
            if prot_one is None:
                continue
            prot_two = session.query(Interactor).get(record['Protein2'])
            if prot_two is None:
                continue
            same_protein = (prot_one == prot_two)
            pair = session.query(Interaction).filter(
                Interaction.interactors.contains(prot_one),
                Interaction.interactors.contains(prot_two),
                Interaction.homogenous == same_protein).first()
            if pair is None:
                pair = Interaction(strain='PAO1', homogenous=same_protein,
                                   type='p-p',
                                   interactors=[prot_one, prot_two])
                pair.sources.append(zhang_source)
                session.add(pair)
                session.commit()
            elif zhang_source not in pair.sources:
                pair.sources.append(zhang_source)
            # Re-use an existing reference row if one matches exactly.
            ref = session.query(InteractionReference).filter_by(
                detection_method='computational prediction',
                pmid='22848443',
                interaction_type='predicted',
                confidence=record['Confidence'],
                comment=record['Comment']).first()
            if ref is None:
                ref = InteractionReference(
                    detection_method='computational prediction',
                    author_ln='Zhang',
                    pub_date='2012',
                    pmid='22848443',
                    interaction_type='predicted',
                    confidence=record['Confidence'],
                    comment=record['Comment'])
                pair.references.append(ref)
                ref.sources.append(zhang_source)
            else:
                if ref not in pair.references:
                    pair.references.append(ref)
                if zhang_source not in ref.sources:
                    ref.sources.append(zhang_source)
        session.commit()
        print('zhang', session.query(Interaction).count())
def parse(session):
    """Load EcoCyc-derived interactions for PAO1 and PA14.

    Loads EcoCyc pathway and compound data first, registers a single
    'EcoCyc' source, then parses each strain in turn.
    """
    # Pathway definitions from the EcoCyc export go into ecocyc_paths.
    get_ecocyc_paths()
    # Compounds from the EcoCyc interactor files go into ecocyc_compounds.
    get_ecocyc_compounds(session)
    update_metabolite_info_ecocyc(session)
    # No references exist for any of these interactions, so the
    # experimental status is unknown (is_experimental=2).
    ecocyc_source = InteractionSource(data_source='EcoCyc', is_experimental=2)
    session.add(ecocyc_source)
    session.commit()
    # PAO1 and PA14 are parsed independently.
    parse_ecocyc('PAO1', session)
    parse_ecocyc('PA14', session)
def parse_ecoli(session):
    """Load KEGG E. coli interactions mapped onto PAO1 and PA14 orthologs."""
    # Lazily populate the module-level KEGG compound cache.
    if not kegg_compounds:
        get_kegg_compounds()
    # Backfill ids on metabolites that may be missing them.
    update_metabolite_info_kegg(session)
    # is_experimental=2 ("unknown"): KEGG gives no way to determine
    # whether the detection method was experimental.
    ecoli_source = InteractionSource(data_source='KEGG(Ecoli)', is_experimental=2)
    session.add(ecoli_source)
    session.commit()
    # Map the 'eco' interactions onto each Pseudomonas strain separately.
    parse_kegg('eco', 'PAO1', 'KEGG(Ecoli)', session)
    parse_kegg('eco', 'PA14', 'KEGG(Ecoli)', session)
def parse(session):
    """Load XLinkDB cross-linking PAO1 interactions.

    Every interaction in the file shares a single reference (Navari 2015,
    pmid 25800553) and a single experimental 'XLinkDB' source.
    """
    def _find_interactor(identifier):
        # Try a direct interactor id first, then a UniProt accession.
        hit = session.query(Interactor).get(identifier)
        if hit is None:
            hit = session.query(Protein).filter_by(uniprotkb=identifier).first()
        return hit

    with open('Data/PAO1/xlinkdb.txt') as handle:
        records = csv.DictReader(handle, delimiter='\t')
        shared_ref = InteractionReference(
            detection_method='chemical cross-linking mass spectrometry',
            interaction_type='physical association',
            author_ln='Navari',
            pub_date='2015',
            pmid='25800553',
            source_db='xlinkdb')
        xlink_source = InteractionSource(data_source='XLinkDB', is_experimental=1)
        xlink_source.references.append(shared_ref)
        session.add(xlink_source)
        session.add(shared_ref)
        session.commit()
        for record in records:
            prot_one = _find_interactor(record['proA'])
            if prot_one is None:
                continue
            prot_two = _find_interactor(record['proB'])
            if prot_two is None:
                continue
            same_protein = (prot_one == prot_two)
            pair = session.query(Interaction).filter(
                Interaction.interactors.contains(prot_one),
                Interaction.interactors.contains(prot_two),
                Interaction.homogenous == same_protein).first()
            if pair is None:
                pair = Interaction(strain='PAO1', homogenous=same_protein,
                                   type='p-p',
                                   interactors=[prot_one, prot_two])
                pair.references.append(shared_ref)
                pair.sources.append(xlink_source)
                session.add(pair)
                session.commit()
            else:
                if shared_ref not in pair.references:
                    pair.references.append(shared_ref)
                if xlink_source not in pair.sources:
                    pair.sources.append(xlink_source)
        session.commit()
        print('xlinkdb', session.query(Interaction).count())
def parse(session):
    """Load Geoff Winsor's curated PAO1 interactions.

    The CSV lists each interaction as two consecutive rows: one for
    interactor A, the next for interactor B.  Reference metadata
    (experimental_type, pmid) is taken from the B row.

    Fix vs. original: the original only consumed the partner row AFTER
    checking interactor A, so a missing A skipped a single row and every
    subsequent pair fell out of sync; it also called bare next(reader),
    which raises StopIteration on a truncated (odd-row-count) file.
    Both rows of a pair are now always consumed together.
    """
    with open('Data/PAO1/GeoffWinsor.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        source = InteractionSource(data_source='Geoff', is_experimental=1)
        session.add(source)
        session.commit()
        for row_a in reader:
            # Always consume the partner row, even when A is unknown,
            # so pairing never desynchronizes.
            row = next(reader, None)
            if row is None:
                # Truncated file: dangling A row with no partner.
                break
            interactor_A = session.query(Interactor).get(row_a['locus_tag'])
            if interactor_A is None:
                continue
            interactor_B = session.query(Interactor).get(row['locus_tag'])
            if interactor_B is None:
                continue
            homogenous = (interactor_A == interactor_B)
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactor_A),
                Interaction.interactors.contains(interactor_B),
                Interaction.homogenous == homogenous).first()
            if interaction is None:
                interaction = Interaction(strain='PAO1', homogenous=homogenous,
                                          type='p-p',
                                          interactors=[interactor_A, interactor_B])
                interaction.sources.append(source)
                session.add(interaction)
                session.commit()
            elif source not in interaction.sources:
                interaction.sources.append(source)
            # Reference metadata lives on the B row (matches original code).
            reference = session.query(InteractionReference).filter_by(
                detection_method=row['experimental_type'],
                pmid=row['pmid']).first()
            if reference is None:
                reference = InteractionReference(
                    detection_method=row['experimental_type'],
                    pmid=row['pmid'])
                interaction.references.append(reference)
                reference.sources.append(source)
            else:
                if interaction not in reference.interactions:
                    reference.interactions.append(interaction)
                if source not in reference.sources:
                    reference.sources.append(source)
        session.commit()
        print('geoff', session.query(Interaction).count())
def parse(session):
    """Load RegulonDB E. coli regulatory interactions, mapped onto
    Pseudomonas proteins through their E. coli orthologs.

    For each TF -> regulated-gene row, every (ortholog_A, ortholog_B)
    pair whose target strains match yields one Pseudomonas interaction,
    with one InteractionReference per listed evidence type.

    Fix vs. original: SQLAlchemy's Query.all() returns a list and never
    None, so the original `if orthologs_A is None: continue` checks were
    dead code; empty result lists are now detected with truthiness
    checks, as the comments intended.
    """
    with open('Data/Ecoli/RegulonDB.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        # All interactions share one source; no references are available,
        # so is_experimental=2 (unknown).
        source = InteractionSource(data_source='RegulonDB(Ecoli)',
                                   is_experimental=2)
        session.add(source)
        session.commit()
        for row in reader:
            interactors = []
            # RegulonDB capitalizes TF names; ortholog gene names start
            # lower-case, so lower the first letter before matching.
            orthologs_A = session.query(OrthologEcoli).filter_by(
                ortholog_name=(row['TF name'][0].lower() +
                               row['TF name'][1:])).all()
            # No orthologs for the first interactor: skip the row.
            if not orthologs_A:
                continue
            orthologs_B = session.query(OrthologEcoli).filter_by(
                ortholog_name=row['Regulated gene']).all()
            # No orthologs for the second interactor: skip the row.
            if not orthologs_B:
                continue
            # Build interactor pairs from the orthologs' Pseudomonas
            # proteins; only pair proteins from the same strain.  Keep the
            # E. coli ortholog id alongside each protein for the reference.
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])
            for interactor_pair in interactors:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is None:
                    interaction = Interaction(
                        strain=interactor_pair[0][0].strain,
                        interactors=[interactor_pair[0][0],
                                     interactor_pair[1][0]],
                        type='p-p',
                        ortholog_derived='Ecoli')
                    interaction.sources.append(source)
                    session.add(interaction)
                    session.commit()
                elif source not in interaction.sources:
                    interaction.sources.append(source)
                # Align reference interactor_a/b with the stored
                # interaction's interactor order, so each Pseudomonas
                # interactor is traceable to its E. coli ortholog.
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                interaction_type = ('TF/sigma-binding site (' +
                                    row['Regulatory effect'] + 'regulation)')
                comment = (interactor_pair[0][1] + ' regulates(' +
                           row['Regulatory effect'] + ') ' +
                           interactor_pair[1][1])
                # One reference per evidence type; the Evidence field is
                # bracketed, hence the [1:-1] strip.
                for evidence in row['Evidence'][1:-1].split(', '):
                    reference = session.query(InteractionReference).filter_by(
                        detection_method=evidence,
                        interaction_type=interaction_type,
                        source_db='regulondb',
                        confidence=row['Evidence type'],
                        comment=comment,
                        interactor_a=interactor_a,
                        interactor_b=interactor_b).first()
                    if reference is None:
                        reference = InteractionReference(
                            detection_method=evidence,
                            interaction_type=interaction_type,
                            comment=comment,
                            source_db='regulondb',
                            confidence=row['Evidence type'],
                            interactor_a=interactor_a,
                            interactor_b=interactor_b)
                        interaction.references.append(reference)
                        reference.sources.append(source)
                    else:
                        if interaction not in reference.interactions:
                            interaction.references.append(reference)
                        if source not in reference.sources:
                            reference.sources.append(source)
        session.commit()
        print('regulondb', session.query(Interaction).count())
def parse_ecoli_uniprot(session):
    """Load UniProt PSICQUIC E. coli interactions via ortholog mapping.

    Interactor B must be a protein (uniprotkb id) with Pseudomonas
    orthologs.  Interactor A may be a protein (uniprotkb) or a compound
    (chebi); unknown compounds are created as new Metabolite rows.

    Fix vs. original: the p-p Interaction type was built as
    `interactor_pair[0][0].type + '-' + interactor_pair[1][0]` — missing
    the trailing `.type`, which concatenated a string with the interactor
    object (TypeError).  Corrected to match the parallel parsers.
    """
    with open('Ecoli/PSICQUIC/UniProt.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            interactors = []
            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            # B must resolve to at least one ortholog.
            if len(orthologs_B) == 0:
                continue
            orthologs_A = []
            metabolite = None
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'chebi':
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == id_A[1]).first()
                if metabolite is None:
                    # Unknown compound: create it keyed by its ChEBI id.
                    metabolite = Metabolite(id=id_A[1], chebi=id_A[1])
                    session.add(metabolite)
                    session.commit()
            # p-p pairs: only pair orthologs whose target strains match.
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])
            # p-m pairs: metabolite against every B ortholog.
            if metabolite is not None:
                for ortholog_B in orthologs_B:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog_B.protein, ortholog_B.ortholog_id]])
            for interactor_pair in interactors:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # Mark existing interactions as confirmed-from-Ecoli.
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    interaction = Interaction(
                        strain=interactor_pair[0][0].strain,
                        interactors=[interactor_pair[0][0],
                                     interactor_pair[1][0]],
                        # BUG FIX: second term previously lacked `.type`.
                        type=(interactor_pair[0][0].type + '-' +
                              interactor_pair[1][0].type),
                        ortholog_derived='fe')
                    if 'MI:' in row['Interaction detection method(s)']:
                        if is_experimental_psimi(
                                row['Interaction detection method(s)'].split(
                                    'MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()
                # Align reference interactor_a/b with the stored order.
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                # NOTE(review): this parser uses confidence_score /
                # interactor_a_id / interactor_b_id kwargs, unlike the
                # sibling parsers (confidence / interactor_a / ...) —
                # verify against the InteractionReference model.
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    psimi_detection=row['Interaction detection method(s)'].
                    split('MI:')[1][:4],
                    detection_method=row['Interaction detection method(s)'].
                    split('(')[1][:-1],
                    author_ln=row['Publication 1st author(s)'].split(' ')[0],
                    pub_date=row['Publication 1st author(s)'].split(
                        '(')[1][:-1],
                    pmid=row['Publication Identifier(s)'].split(
                        'pubmed:')[1].split('|')[0],
                    psimi_type=row['Interaction type(s)'].split('MI:')[1][:4],
                    interaction_type=row['Interaction type(s)'].split(
                        '(')[1][:-1],
                    psimi_db=row['Source database(s)'].split('MI:')[1][:4],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    confidence_score=row['Confidence value(s)'],
                    interactor_a_id=interactor_a,
                    interactor_b_id=interactor_b)
                session.add(reference)
                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'UniProt').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='UniProt')
                    session.add(source)
        session.commit()
        print(session.query(Interaction).count())
def parse(session):
    """Load the Galan-Vasquez regulatory network for PAO1 and PA14.

    Each CSV row describes a Regulator -> Target regulatory interaction
    and may apply to more than one strain.  One InteractionReference is
    created per evidence type listed for the row (or a single reference
    with detection_method=None when no evidence is given).
    """
    # create and add sources for the interactions (do this before since they all use the same source)
    # Note: is_experimental is set to 2 because we cannot confirm that detection method was experimental or not
    source_PAO1 = InteractionSource(data_source='Galan-Vasquez(PAO1)',
                                    is_experimental=2)
    source_PA14 = InteractionSource(data_source='Galan-Vasquez(PA14)',
                                    is_experimental=2)
    session.add(source_PAO1), session.add(source_PA14), session.commit()
    with open('Data/PAO1_PA14/regulatory_network.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # a row describing an interaction may have >1 strain
            strains = row['Strain'].split(',')
            for strain in strains:
                # only care about PAO1 and PA14 strain interactions
                if (strain != 'PAO1') and (strain != 'PA14'):
                    continue
                # search for interactor A by name
                interactor_A = session.query(Protein).filter_by(
                    name=row['Regulator'], strain=strain).first()
                # if no interactor was found by name, id listed may be a gene locus, so search by this id
                if interactor_A is None:
                    interactor_A = session.query(Interactor).get(
                        row['Regulator'])
                # if no interactor A was found for this interaction, skip to next
                if interactor_A is None:
                    continue
                # same as A above
                interactor_B = session.query(Protein).filter_by(
                    name=row['Target'], strain=strain).first()
                if interactor_B is None:
                    interactor_B = session.query(Interactor).get(row['Target'])
                if interactor_B is None:
                    continue
                homogenous = (interactor_A == interactor_B)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_A),
                    Interaction.interactors.contains(interactor_B),
                    Interaction.homogenous == homogenous).first()
                # if interaction between these 2 interactors does not yet exist, create and add it
                if interaction is None:
                    interaction = Interaction(
                        strain=strain, type='p-p', homogenous=homogenous,
                        interactors=[interactor_A, interactor_B])
                    session.add(interaction), session.commit()
                # specify the source to be used for the interaction and
                # reference based on strain of interaction
                source = None
                if strain == 'PAO1':
                    source = source_PAO1
                else:
                    source = source_PA14
                # add the source to the interaction source list if it isn't there already
                if source not in interaction.sources:
                    interaction.sources.append(source)
                # get source db and detections if they are present in the file
                # detections starts as [None] so that rows without evidence
                # still yield exactly one reference below
                source_db, detections = None, [None]
                if row['source_db'] != '':
                    source_db = row['source_db']
                if row['evidence'] != '':
                    del detections[0]
                    for type in row['evidence'].split(', '):
                        detections.append(type)
                # create a new reference for each detection found, add the reference to the interaction's
                # reference list, and add the source to the reference's sources
                # NOTE(review): unlike the sibling parsers, no duplicate check
                # is done here — re-running this parser creates duplicate
                # references; confirm this is intentional
                for detection in detections:
                    reference = InteractionReference(
                        detection_method=detection,
                        pmid=row['pmid'],
                        interaction_type='TF/sigma-binding site (' +
                        row['mode'] + 'regulation)',
                        source_db=source_db,
                        comment=interactor_A.id + ' regulates(' + row['mode'] +
                        ') ' + interactor_B.id)
                    interaction.references.append(reference)
                    reference.sources.append(source)
        session.commit()
        print('regnet', session.query(Interaction).count())
def parse_mpidb(session):
    """Load MPIDB PSICQUIC interactions for PAO1.

    Only rows whose first taxid for both interactors is
    'taxid:208964(pseae)' (P. aeruginosa PAO1) are kept.  Interactors are
    resolved first by Interactor.id, then by Protein.uniprotkb.  A new
    InteractionReference and InteractionXref rows are added per row.
    """
    with open('PAO1/PSICQUIC/MPIDB.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            interactors = []
            # skip rows where either interactor is not PAO1
            # (bitwise | on two bools acts as a logical or here)
            if (row['Taxid interactor A'].split('|')[0] !=
                    'taxid:208964(pseae)') |\
                    (row['Taxid interactor B'].split('|')[0] !=
                     'taxid:208964(pseae)'):
                continue
            A_id = row['#ID(s) interactor A'].split(':')[1]
            B_id = row['ID(s) interactor B'].split(':')[1]
            # resolve A: direct interactor id first, then uniprot accession
            if session.query(Interactor).filter(
                    Interactor.id == A_id).first() is not None:
                interactors.append(
                    session.query(Interactor).filter(
                        Interactor.id == A_id).one())
            elif session.query(Protein).filter(
                    Protein.uniprotkb == A_id).first() is not None:
                interactors.append(
                    session.query(Protein).filter(
                        Protein.uniprotkb == A_id).one())
            # resolve B the same way
            if session.query(Interactor).filter(
                    Interactor.id == B_id).first() is not None:
                interactors.append(
                    session.query(Interactor).filter(
                        Interactor.id == B_id).one())
            elif session.query(Protein).filter(
                    Protein.uniprotkb == B_id).first() is not None:
                interactors.append(
                    session.query(Protein).filter(
                        Protein.uniprotkb == B_id).one())
            # both interactors must have been resolved
            if len(interactors) != 2:
                continue
            homogenous = (interactors[0] == interactors[1])
            interaction = session.query(Interaction).filter(
                (Interaction.interactors.contains(interactors[0])),
                (Interaction.interactors.contains(interactors[1])),
                (Interaction.homogenous == homogenous)).first()
            if interaction is None:
                type = interactors[0].type + '-' + interactors[1].type
                interaction = Interaction(strain='PAO1', type=type,
                                          homogenous=homogenous,
                                          interactors=interactors)
                # flag the interaction experimental based on the PSI-MI
                # detection-method code (4 digits after 'MI:')
                if is_experimental_psimi(
                        row['Interaction detection method(s)'].split(
                            'MI:')[1][:4]):
                    interaction.is_experimental = 1
                else:
                    interaction.is_experimental = 0
                session.add(interaction), session.commit()
            else:
                # an existing interaction can only be upgraded to
                # experimental, never downgraded
                if is_experimental_psimi(
                        row['Interaction detection method(s)'].split(
                            'MI:')[1][:4]):
                    interaction.is_experimental = 1
            # new reference per row; fields parsed out of PSI-MI TAB columns
            # NOTE(review): pmid is taken as 8 chars after 'pubmed:' —
            # assumes 8-digit PMIDs; verify against the data
            reference = InteractionReference(
                interaction_id=interaction.id,
                detection_method=row['Interaction detection method(s)'].split(
                    '(')[1][:-1],
                author_ln=row['Publication 1st author(s)'].split(' ')[0],
                pub_date=row['Publication 1st author(s)'].split('(')[1][:-1],
                pmid=row['Publication Identifier(s)'].split('pubmed:')[1][:8],
                confidence=row['Confidence value(s)'],
                interaction_type=row['Interaction type(s)'].split('(')[1][:-1],
                source_db=row['Source database(s)'])
            session.add(reference)
            # add any cross-references ('db:accession' entries) not yet stored
            for xref in row['Interaction identifier(s)'].split('|'):
                xref_field = xref.split(':')
                xref = session.query(InteractionXref).filter(
                    InteractionXref.accession == xref_field[1],
                    InteractionXref.interaction_id == interaction.id).first()
                if xref is None:
                    xref = InteractionXref(interaction_id=interaction.id,
                                           accession=xref_field[1],
                                           data_source=xref_field[0])
                    session.add(xref)
            # attach the MPIDB source if this interaction doesn't have it yet
            source = session.query(InteractionSource).filter(
                InteractionSource.interaction_id == interaction.id,
                InteractionSource.data_source == 'MPIDB').first()
            if source is None:
                source = InteractionSource(interaction_id=interaction.id,
                                           data_source='MPIDB')
                session.add(source)
        session.commit()
        print(session.query(Interaction).count())
def parse_psimi(session, file, source):
    """Parse a PSI-MI TAB file of E. coli interactions and map them onto
    Pseudomonas proteins through their E. coli orthologs.

    file   -- path to the tab-separated file (columns named by `cols`)
    source -- data-source name recorded on the resulting interactions

    Interactions are protein-protein when both interactors resolve to
    orthologs, or protein-metabolite when one side resolves only to a
    ChEBI/PubChem compound id.

    Fix vs. original: SQLAlchemy's Query.all() returns a list and never
    None, so the original `orthologs_X is None` tests could not detect
    an empty uniprot lookup — the commented refseq fallback and the
    metabolite path were unreachable in that case.  Empty results are
    now detected with truthiness checks.
    """
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, fieldnames=cols, delimiter='\t')
        for row in reader:
            uniprot_A, refseq_A, orthologs_A = None, None, None
            uniprot_B, refseq_B, orthologs_B = None, None, None
            # compound ids, filled only when an interactor is a metabolite
            pubchem, chebi = None, None
            # for a p-m interaction, `orthologs` holds the protein side
            metabolite, orthologs = None, None

            # --- interactor A: extract protein ids and find orthologs ---
            if 'uniprotkb' in row['interactor_A']:
                uniprot_A = row['interactor_A'].split('uniprotkb:')[1].split(
                    '|')[0]
            if 'refseq' in row['interactor_A']:
                refseq_A = row['interactor_A'].split('refseq:')[1].split(
                    '|')[0]
            if uniprot_A is not None:
                orthologs_A = session.query(OrthologEcoli).filter_by(
                    ortholog_uniprot=uniprot_A).all()
            # fall back to refseq when the uniprot lookup found nothing
            if not orthologs_A and refseq_A is not None:
                orthologs_A = session.query(OrthologEcoli).filter_by(
                    ortholog_refseq=refseq_A).all()
            # A is a protein (has uniprot/refseq) but has no orthologs:
            # nothing to map, skip the interaction
            if not orthologs_A and (uniprot_A is not None or
                                    refseq_A is not None):
                continue

            # --- interactor B: same as A ---
            if 'uniprotkb' in row['interactor_B']:
                uniprot_B = row['interactor_B'].split('uniprotkb:')[1].split(
                    '|')[0]
            if 'refseq' in row['interactor_B']:
                refseq_B = row['interactor_B'].split('refseq:')[1].split(
                    '|')[0]
            if uniprot_B is not None:
                orthologs_B = session.query(OrthologEcoli).filter_by(
                    ortholog_uniprot=uniprot_B).all()
            if not orthologs_B and refseq_B is not None:
                orthologs_B = session.query(OrthologEcoli).filter_by(
                    ortholog_refseq=refseq_B).all()
            if not orthologs_B and (uniprot_B is not None or
                                    refseq_B is not None):
                continue

            # no protein interactor on either side: nothing to map
            if not orthologs_A and not orthologs_B:
                continue

            # A carried no protein ids at all: it may be a metabolite,
            # identified by chebi/pubchem ids in the id or altID columns
            if not orthologs_A:
                if 'chebi' in row['interactor_A']:
                    chebi = row['interactor_A'].split('CHEBI:')[1].split(
                        '|')[0][:-1]
                if 'pubchem' in row['altID_A']:
                    pubchem = row['altID_A'].split('pubchem:')[1].split('|')[0]
                if chebi is None and 'chebi' in row['altID_A']:
                    chebi = row['altID_A'].split('CHEBI:')[1].split(
                        '|')[0][:-1]
                # no compound ids either: interactor A is unidentifiable
                if chebi is None and pubchem is None:
                    continue
                # p-m interaction: the protein side is B's orthologs
                orthologs = orthologs_B
            # symmetric case: B may be the metabolite
            elif not orthologs_B:
                if 'chebi' in row['interactor_B']:
                    chebi = row['interactor_B'].split('CHEBI:')[1].split(
                        '|')[0][:-1]
                if 'pubchem' in row['altID_B']:
                    pubchem = row['altID_B'].split('pubchem:')[1].split('|')[0]
                if chebi is None and 'chebi' in row['altID_B']:
                    chebi = row['altID_B'].split('CHEBI:')[1].split(
                        '|')[0][:-1]
                if chebi is None and pubchem is None:
                    continue
                orthologs = orthologs_A

            # find or create the metabolite; safe to create because the
            # other side is guaranteed to have protein orthologs
            if chebi is not None or pubchem is not None:
                # prefer chebi as the new metabolite's primary id
                id = None
                if chebi is not None:
                    id = chebi
                    metabolite = session.query(Metabolite).filter_by(
                        chebi=chebi).first()
                if metabolite is None and pubchem is not None:
                    id = pubchem
                    metabolite = session.query(Metabolite).filter_by(
                        pubchem=pubchem).first()
                if metabolite is None:
                    metabolite = Metabolite(id=id, chebi=chebi,
                                            pubchem=pubchem)
                    session.add(metabolite)
                else:
                    # backfill missing ids on the existing metabolite
                    if metabolite.pubchem is None:
                        metabolite.pubchem = pubchem
                    if metabolite.chebi is None:
                        metabolite.chebi = chebi

            # build interactor pairs (protein kept with its ortholog id)
            interactors = []
            if metabolite is None:
                # p-p: only pair orthologs whose target strains match
                for ortholog_A in orthologs_A:
                    for ortholog_B in orthologs_B:
                        if (ortholog_A.strain_protein ==
                                ortholog_B.strain_protein):
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein,
                                  ortholog_B.ortholog_id]])
            else:
                # p-m: pair the metabolite with every protein ortholog
                for ortholog in orthologs:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog.protein, ortholog.ortholog_id]])

            for interactor_pair in interactors:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is None:
                    # strain comes from whichever interactor is the protein
                    if interactor_pair[0][0].type == 'p':
                        strain = interactor_pair[0][0].strain
                    else:
                        strain = interactor_pair[1][0].strain
                    interaction = Interaction(
                        strain=strain,
                        interactors=[interactor_pair[0][0],
                                     interactor_pair[1][0]],
                        type=(interactor_pair[0][0].type + '-' +
                              interactor_pair[1][0].type),
                        ortholog_derived='Ecoli')
                    session.add(interaction)
                    session.commit()
                ref_parameter_list = get_psimi_ref_list(row)
                # align reference interactor_a/b with the interaction's
                # stored interactor order so each Pseudomonas interactor is
                # traceable to its E. coli ortholog
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                is_experimental = is_experimental_interaction(row)
                # find or create the source for this (name, experimental)
                nsource = session.query(InteractionSource).filter_by(
                    data_source=source,
                    is_experimental=is_experimental).first()
                if nsource is None:
                    nsource = InteractionSource(
                        data_source=source, is_experimental=is_experimental)
                    interaction.sources.append(nsource)
                elif nsource not in interaction.sources:
                    interaction.sources.append(nsource)
                # one reference per parameter tuple:
                # (detection, author, date, pmid, type, db, confidence)
                for ref in ref_parameter_list:
                    nref = session.query(InteractionReference).filter_by(
                        detection_method=ref[0], author_ln=ref[1],
                        pub_date=ref[2], pmid=ref[3], interaction_type=ref[4],
                        source_db=ref[5], confidence=ref[6],
                        interactor_a=interactor_a,
                        interactor_b=interactor_b).first()
                    if nref is None:
                        nref = InteractionReference(
                            detection_method=ref[0], author_ln=ref[1],
                            pub_date=ref[2], pmid=ref[3],
                            interaction_type=ref[4], source_db=ref[5],
                            confidence=ref[6], interactor_a=interactor_a,
                            interactor_b=interactor_b)
                        interaction.references.append(nref)
                        nref.sources.append(nsource)
                    else:
                        if interaction not in nref.interactions:
                            nref.interactions.append(interaction)
                        if nsource not in nref.sources:
                            nref.sources.append(nsource)
        session.commit()
        print(source, session.query(Interaction).count())
def parse_ecoli_bindingdb(session):
    """Load BindingDB protein-metabolite interactions via ortholog mapping.

    Interactor B (protein, uniprotkb) is mapped to Pseudomonas proteins
    through E. coli orthologs; interactor A (compound) is resolved by
    ChEBI id, then PubChem id, and created as a new Metabolite when
    unknown.

    Fix vs. original: the new-interaction branch read `interactor.strain`,
    but `interactor` is a `[protein, ortholog_id]` list pair (built
    above), which has no `.strain` attribute — corrected to
    `interactor[0].strain`.
    """
    with open('Data/Ecoli/PSICQUIC/BindingDB.txt') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            uniprot_protein = None
            # interactor B must carry a uniprot id
            if 'uniprotkb' in row['ID(s) interactor B']:
                uniprot_protein = row['ID(s) interactor B'].split(
                    'uniprotkb:')[1].split('|')[0]
            if uniprot_protein is None:
                continue
            # collect (protein, ortholog_id) pairs for every matching ortholog
            orthologs = []
            for ecoli_ortholog in session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == uniprot_protein).all():
                orthologs.append(
                    [ecoli_ortholog.protein, ecoli_ortholog.ortholog_id])
            if len(orthologs) == 0:
                continue
            # interactor A: look for a ChEBI id among its ids
            ids_metabolite = row['#ID(s) interactor A'].split('|')
            chebi_metabolite, pubchem_metabolite = None, None
            for id in ids_metabolite:
                if id.split(':')[0] == 'chebi':
                    # value is quoted, strip the surrounding quotes
                    chebi_metabolite = id.split(':')[1][1:-1]
            metabolite = None
            if chebi_metabolite is not None:
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == chebi_metabolite).first()
            # fall back to a PubChem id from the alternate-id column
            if metabolite is None:
                alt_ids_metabolite = row['Alt. ID(s) interactor A'].split('|')
                for id in alt_ids_metabolite:
                    if id.split(':')[0] == 'pubchem':
                        pubchem_metabolite = id.split(':')[1]
                        # metabolites created by this parser use the
                        # pubchem id as their primary id, hence the
                        # Metabolite.id match
                        metabolite = session.query(Metabolite).filter(
                            Metabolite.id == pubchem_metabolite).first()
            # unknown compound: create it
            if metabolite is None:
                metabolite = Metabolite(id=pubchem_metabolite,
                                        pubchem=pubchem_metabolite,
                                        chebi=chebi_metabolite)
                session.add(metabolite)
                session.commit()
            for interactor in orthologs:
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor[0]),
                    Interaction.interactors.contains(metabolite)).first()
                if interaction is not None:
                    # mark existing interactions as confirmed-from-Ecoli
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    interaction = Interaction(
                        # BUG FIX: was `interactor.strain` on the list pair
                        strain=interactor[0].strain,
                        interactors=[metabolite, interactor[0]],
                        type='p-m',
                        ortholog_derived='fe')
                    # NOTE(review): flags ortholog-derived interactions as
                    # experimental from the E. coli evidence — confirm this
                    # is the intended semantics
                    if is_experimental_psimi(
                            row['Interaction detection method(s)'].split(
                                'MI:')[1][:4]):
                        interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()
                # align reference interactor_a/b with the stored order
                interactor_a, interactor_b = '', ''
                if interaction.interactors[0] == metabolite:
                    interactor_a = metabolite.id
                    interactor_b = interactor[1]
                else:
                    interactor_b = metabolite.id
                    interactor_a = interactor[1]
                # publication fields are optional in the file
                author, date, pmid = None, None, None
                if row['Publication 1st author(s)'] != '-':
                    author = row['Publication 1st author(s)'].split(' ')[0]
                    date = row['Publication 1st author(s)'].split('(')[1][:-1]
                if 'pubmed:' in row['Publication Identifier(s)']:
                    pmid = row['Publication Identifier(s)'].split(
                        'pubmed:')[1][:8]
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    detection_method=row['Interaction detection method(s)'].
                    split('(')[1][:-1],
                    author_ln=author,
                    pmid=pmid,
                    pub_date=date,
                    interaction_type=row['Interaction type(s)'].split(
                        '(')[1][:-1],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    confidence=row['Confidence value(s)'].split('(')[0],
                    interactor_a=interactor_a,
                    interactor_b=interactor_b)
                # attach the BindingDB source if not already present
                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'BindingDB').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='BindingDB')
                    session.add(source)
                session.add(reference)
                session.commit()
def parse_psimi(file, strain, source, session):
    """Parse a PSI-MI TAB file and load protein-protein interactions.

    file: path to a tab-delimited PSI-MI TAB file (header row is skipped;
        column names come from the module-level `cols` list — presumably
        defined at module scope, not visible here — TODO confirm).
    strain: strain label stored on each new Interaction.
    source: data-source name for the InteractionSource rows.
    session: SQLAlchemy session used for all queries/commits.
    """
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=cols)
        # explicit fieldnames were supplied, so skip the file's header row
        next(reader)
        for row in reader:
            uniprot_A, refseq_A, interactor_A, uniprot_B, refseq_B, interactor_B = None, None, None, None, None, None
            # check if interactor A has uniprot or refseq id, store these values
            if 'uniprotkb' in row['interactor_A']:
                uniprot_A = row['interactor_A'].split('uniprotkb:')[1].split(
                    '|')[0]
            if 'refseq' in row['interactor_A']:
                refseq_A = row['interactor_A'].split('refseq:')[1].split(
                    '|')[0]
            # if a uniprot id was found, try to find the interactor in the database
            if uniprot_A is not None:
                # check if there is a protein-complex with this uniprot id
                interactor_A = session.query(Interactor).get(uniprot_A)
                # if no protein complex, check for protein matching the uniprot id
                if interactor_A is None:
                    interactor_A = session.query(Protein).filter_by(
                        uniprotkb=uniprot_A).first()
            # if no interactor A was found but there was also a refseq id,
            # try to find the protein based on its refseq accession
            if (interactor_A is None) and (refseq_A is not None):
                interactor_A = session.query(Protein).filter_by(
                    ncbi_acc=refseq_A).first()
            # if no interactor A was found, move on to next interaction
            if interactor_A is None:
                continue
            # same lookup cascade as for A above
            if 'uniprotkb' in row['interactor_B']:
                uniprot_B = row['interactor_B'].split('uniprotkb:')[1].split(
                    '|')[0]
            if 'refseq' in row['interactor_B']:
                refseq_B = row['interactor_B'].split('refseq:')[1].split(
                    '|')[0]
            if uniprot_B is not None:
                interactor_B = session.query(Interactor).get(uniprot_B)
                if interactor_B is None:
                    interactor_B = session.query(Protein).filter_by(
                        uniprotkb=uniprot_B).first()
            if (interactor_B is None) and (refseq_B is not None):
                interactor_B = session.query(Protein).filter_by(
                    ncbi_acc=refseq_B).first()
            if interactor_B is None:
                continue
            # self-interaction flag; used to disambiguate A-B vs A-A pairs
            homogenous = (interactor_A == interactor_B)
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactor_A),
                Interaction.interactors.contains(interactor_B),
                Interaction.homogenous == homogenous).first()
            # if no interaction was found with the interactors, create a new interaction
            if interaction is None:
                interaction = Interaction(
                    strain=strain,
                    type='p-p',
                    homogenous=homogenous,
                    interactors=[interactor_A, interactor_B])
                session.add(interaction), session.commit()
            # reference tuples extracted from the row; each ref is
            # (detection_method, author_ln, pub_date, pmid, interaction_type,
            #  source_db, confidence) — order assumed from usage below
            ref_parameter_list = get_psimi_ref_list(row)
            is_experimental = is_experimental_interaction(row)
            # check to see if source exists
            nsource = session.query(InteractionSource).filter_by(
                data_source=source, is_experimental=is_experimental).first()
            # if source doesn't exist, create and add it to the interaction's sources
            if nsource is None:
                nsource = InteractionSource(data_source=source,
                                            is_experimental=is_experimental)
                interaction.sources.append(nsource)
            # if the source does exist, add it to the interaction's sources if it isn't already
            elif nsource not in interaction.sources:
                interaction.sources.append(nsource)
            # go through each reference in the ref_parameter list, search for it,
            # and if it doesn't exist create it
            for ref in ref_parameter_list:
                nref = session.query(InteractionReference).filter_by(
                    detection_method=ref[0],
                    author_ln=ref[1],
                    pub_date=ref[2],
                    pmid=ref[3],
                    interaction_type=ref[4],
                    source_db=ref[5],
                    confidence=ref[6],
                    interactor_a=None,
                    interactor_b=None).first()
                # if nref doesn't exist, create and add it to the interaction's
                # reference list, and add the source to the reference's sources
                if nref is None:
                    nref = InteractionReference(detection_method=ref[0],
                                                author_ln=ref[1],
                                                pub_date=ref[2],
                                                pmid=ref[3],
                                                interaction_type=ref[4],
                                                source_db=ref[5],
                                                confidence=ref[6])
                    interaction.references.append(nref)
                    nref.sources.append(nsource)
                # if nref does exist, add the interaction and source to its
                # attributes if they aren't added
                else:
                    if interaction not in nref.interactions:
                        nref.interactions.append(interaction)
                    if nsource not in nref.sources:
                        nref.sources.append(nsource)
            # collect all the cross references for the interaction
            for xref in row['identifier'].split('|'):
                xref_field = xref.split(':')
                # check if the cross reference exists for this interaction, if it doesnt create it
                xref = session.query(InteractionXref).filter_by(
                    accession=xref_field[1],
                    interaction_id=interaction.id).first()
                if xref is None:
                    xref = InteractionXref(interaction_id=interaction.id,
                                           accession=xref_field[1],
                                           data_source=xref_field[0])
                    session.add(xref)
            # commit once per row so new sources/references are persisted
            session.commit()
        print(source, session.query(Interaction).count())
def parse_ecoli_ebi_goa_nonintact(session):
    """Load E. coli EBI-GOA (non-IntAct) interactions mapped through orthologs.

    For each E. coli interaction row, both interactors are mapped to every
    ortholog pair that lies in the same strain; each such pair becomes (or
    updates) an ortholog-derived Interaction ('fe' = from E. coli; existing
    interactions get 'cfe' = confirmed from E. coli appended).

    session: SQLAlchemy session used for all queries/commits.
    """
    with open('Ecoli/PSICQUIC/EBI-GOA-nonIntAct.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            interactors = []
            uniprot_A, uniprot_B = None, None
            if 'uniprotkb:' in row['#ID(s) interactor A']:
                uniprot_A = row['#ID(s) interactor A'].split('uniprotkb:')[1]
            if 'uniprotkb:' in row['ID(s) interactor B']:
                uniprot_B = row['ID(s) interactor B'].split('uniprotkb:')[1]
            # only uniprot-identified rows can be ortholog-mapped
            if (uniprot_A is None) | (uniprot_B is None):
                continue
            orthologs_A = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot == uniprot_A).all()
            orthologs_B = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot == uniprot_B).all()
            # keep only ortholog pairs that belong to the same strain
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if (ortholog_A is not None) and (ortholog_B is not None):
                        if ortholog_A.strain_protein == ortholog_B.strain_protein:
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein, ortholog_B.ortholog_id]])
            for interactor_pair in interactors:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # existing interaction: mark it as confirmed-from-E.coli
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # BUG FIX: was interactors=interactor_pair, which stored the
                    # [protein, ortholog_id] sub-lists instead of the two protein
                    # objects (every sibling parser passes the proteins).
                    interaction = Interaction(
                        strain=interactor_pair[0][0].strain,
                        interactors=[interactor_pair[0][0],
                                     interactor_pair[1][0]],
                        type='p-p',
                        ortholog_derived='fe')
                    # should ortholog interactions be marked as experimental?
                    # BUG FIX: guard against rows without an 'MI:' code, which
                    # previously raised IndexError on split('MI:')[1].
                    if 'MI:' in row['Interaction detection method(s)']:
                        if is_experimental_psimi(
                                row['Interaction detection method(s)'].split(
                                    'MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction), session.commit()
                # record the ortholog ids in the same A/B order as the
                # interaction's interactor list
                interactor_a, interactor_b = '', ''
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    detection_method=row['Interaction detection method(s)'].
                    split('(')[1][:-1],
                    author_ln=row['Publication 1st author(s)'].split(' ')[0],
                    pub_date=row['Publication 1st author(s)'].split('(')[1],
                    pmid=row['Publication Identifier(s)'].split('pubmed:')[1],
                    interaction_type=row['Interaction type(s)'].split(
                        '(')[1][:-1],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    interactor_a_id=row['#ID(s) interactor A'].split(':')[1],
                    interactor_b_id=row['ID(s) interactor B'].split(':')[1])
                session.add(reference)
                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source ==
                    'EBI-GOA non-IntAct').first()
                if source is None:
                    source = InteractionSource(
                        interaction_id=interaction.id,
                        data_source='EBI-GOA non-IntAct')
                    session.add(source)
                session.commit()
    print(session.query(Interaction).count())
def parse_irefindex(file, strain, taxid, session):
    """Parse an iRefIndex PSI-MI TAB file for one strain and load interactions.

    file: path to the tab-delimited iRefIndex file.
    strain: strain label stored on each new Interaction.
    taxid: required taxid prefix; rows where either interactor's taxid
        differs are skipped.
    session: SQLAlchemy session used for all queries/commits.
    """
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            interactors = []
            # both interactors must belong to the requested taxid
            if ((row['Taxid interactor A'].split('|')[0] != taxid) |
                    (row['Taxid interactor B'].split('|')[0] != taxid)):
                continue
            A_id = row['#ID(s) interactor A'].split(':')
            B_id = row['ID(s) interactor B'].split(':')
            # resolve A: Interactor by uniprot id, else Protein by uniprot,
            # else Protein by refseq accession
            if A_id[0] == 'uniprotkb':
                if session.query(Interactor).filter(
                        Interactor.id == A_id[1]).first() is not None:
                    interactors.append(session.query(Interactor).filter(
                        Interactor.id == A_id[1]).one())
                elif session.query(Protein).filter(
                        Protein.uniprotkb == A_id[1]).first() is not None:
                    interactors.append(session.query(Protein).filter(
                        Protein.uniprotkb == A_id[1]).one())
            elif A_id[0] == 'refseq':
                if session.query(Protein).filter(
                        Protein.ncbi_acc == A_id[1]).first() is not None:
                    interactors.append(session.query(Protein).filter(
                        Protein.ncbi_acc == A_id[1]).one())
            # resolve B the same way
            if B_id[0] == 'uniprotkb':
                if session.query(Interactor).filter(
                        Interactor.id == B_id[1]).first() is not None:
                    interactors.append(session.query(Interactor).filter(
                        Interactor.id == B_id[1]).one())
                elif session.query(Protein).filter(
                        Protein.uniprotkb == B_id[1]).first() is not None:
                    interactors.append(session.query(Protein).filter(
                        Protein.uniprotkb == B_id[1]).one())
            elif B_id[0] == 'refseq':
                if session.query(Protein).filter(
                        Protein.ncbi_acc == B_id[1]).first() is not None:
                    interactors.append(session.query(Protein).filter(
                        Protein.ncbi_acc == B_id[1]).one())
            # need both ends resolved to proceed
            if len(interactors) != 2:
                continue
            homogenous = (interactors[0] == interactors[1])
            interaction = session.query(Interaction).filter(
                (Interaction.interactors.contains(interactors[0])),
                (Interaction.interactors.contains(interactors[1])),
                (Interaction.homogenous == homogenous)).first()
            if interaction is None:
                type = interactors[0].type + '-' + interactors[1].type
                interaction = Interaction(strain=strain, type=type,
                                          homogenous=homogenous,
                                          interactors=interactors)
                if row['Interaction detection method(s)'] != '-':
                    if is_experimental_psimi(
                            row['Interaction detection method(s)'].split(
                                'MI:')[1][:4]):
                        interaction.is_experimental = 1
                    else:
                        interaction.is_experimental = 0
                # BUG FIX: the new interaction was never added to the session,
                # so it was not persisted and interaction.id stayed None for
                # the references/xrefs created below (all sibling parsers
                # add + commit here).
                session.add(interaction), session.commit()
            else:
                # BUG FIX: the '-' check must come BEFORE split('MI:')[1] —
                # the original evaluated the split first, so a '-' method
                # raised IndexError and the '-' branch was unreachable.
                if row['Interaction detection method(s)'] == '-':
                    if interaction.is_experimental == 0:
                        interaction.is_experimental = None
                elif is_experimental_psimi(
                        row['Interaction detection method(s)'].split(
                            'MI:')[1][:4]):
                    interaction.is_experimental = 1
            author, date, type = None, None, None
            # [None] placeholders ensure the reference loops below run once
            # even when the column is absent
            pmids, detections = [None], [None]
            if row['Interaction detection method(s)'] != '-':
                del detections[0]
                for method in row['Interaction detection method(s)'].split('|'):
                    detections.append(method.split('(')[1][:-1])
            if (row['Interaction type(s)'] != '-'):
                type = row['Interaction type(s)'].split('(')[1][:-1]
            if (row['Publication 1st author(s)'] != '-'):
                # capitalize the surname portion before the '-'
                author = row['Publication 1st author(s)'].split('-')[0][0].upper() + \
                    row['Publication 1st author(s)'].split('-')[0][1:]
                date = row['Publication 1st author(s)'].split('-')[1]
            if (row['Publication Identifier(s)'] != '-'):
                del pmids[0]
                for pmid in row['Publication Identifier(s)'].split('|'):
                    pmids.append(pmid.split('pubmed:')[1][:8])
            # one reference per (pmid, detection method) combination
            for pmid in pmids:
                for detection in detections:
                    reference = InteractionReference(
                        interaction_id=interaction.id,
                        detection_method=detection,
                        author_ln=author,
                        pub_date=date,
                        pmid=pmid,
                        interaction_type=type,
                        source_db=row['Source database(s)'].split('(')[1][:-1],
                        confidence_score=row['Confidence value(s)'])
                    session.add(reference)
            for xref in row['Interaction identifier(s)'].split('|'):
                xref_field = xref.split(':')
                xref = session.query(InteractionXref).filter(
                    InteractionXref.accession == xref_field[1],
                    InteractionXref.interaction_id == interaction.id).first()
                if xref is None:
                    xref = InteractionXref(interaction_id=interaction.id,
                                           accession=xref_field[1],
                                           data_source=xref_field[0])
                    session.add(xref)
            source = session.query(InteractionSource).filter(
                InteractionSource.interaction_id == interaction.id,
                InteractionSource.data_source == 'iRefIndex').first()
            if source is None:
                source = InteractionSource(interaction_id=interaction.id,
                                           data_source='iRefIndex')
                session.add(source)
            # BUG FIX: persist per-row additions; the original never committed
            # the references/xrefs/sources added above.
            session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_dip(session):
    """Load E. coli DIP interactions mapped through same-strain ortholog pairs.

    Each DIP row's interactors are resolved to OrthologEcoli entries (by
    uniprot, falling back to refseq); every ortholog pair in the same strain
    becomes (or updates) an ortholog-derived Interaction.

    session: SQLAlchemy session used for all queries/commits.
    """
    with open('Ecoli/DIP.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            interactors = []
            ids_A = row['ID interactor A'].split('|')
            ids_B = row['ID interactor B'].split('|')
            refseq_A, uniprotkb_A, refseq_B, uniprotkb_B = '', '', '', ''
            # pull refseq/uniprot accessions out of the pipe-separated id lists
            for id in ids_A:
                fields = id.split(':')
                if fields[0] == 'refseq':
                    refseq_A = fields[1]
                elif fields[0] == 'uniprotkb':
                    uniprotkb_A = fields[1]
            for id in ids_B:
                fields = id.split(':')
                if fields[0] == 'refseq':
                    refseq_B = fields[1]
                elif fields[0] == 'uniprotkb':
                    uniprotkb_B = fields[1]
            orthologs_A, orthologs_B = [], []
            # prefer uniprot lookup, fall back to refseq
            if uniprotkb_A != '':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == uniprotkb_A).all()
            if (len(orthologs_A) == 0) & (refseq_A != ''):
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == refseq_A).all()
            if uniprotkb_B != '':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == uniprotkb_B).all()
            if (len(orthologs_B) == 0) & (refseq_B != ''):
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == refseq_B).all()
            # keep only ortholog pairs that belong to the same strain
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if (ortholog_A is not None) and (ortholog_B is not None):
                        if ortholog_A.strain_protein == ortholog_B.strain_protein:
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein, ortholog_B.ortholog_id]])
            for interactor_pair in interactors:
                is_new = 0
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # existing interaction: mark as confirmed-from-E.coli
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    is_new = 1
                    interaction = Interaction(
                        strain=interactor_pair[0][0].strain,
                        interactors=[
                            interactor_pair[0][0], interactor_pair[1][0]
                        ],
                        type='p-p',
                        ortholog_derived='fe')
                    session.add(interaction), session.commit()
                # NOTE(review): `list` shadows the builtin and is only used to
                # size the reference loop below.
                detections, pmids, types, list = [], [], [], []
                if row['Interaction detection method(s)'] != '-':
                    detections = row['Interaction detection method(s)'].split(
                        '|')
                    list.append(detections)
                if row['Publication Identifier(s)'] != '-':
                    pmids = row['Publication Identifier(s)'].split('|')
                    list.append(pmids)
                if row['Interaction type(s)'] != '-':
                    types = row['Interaction type(s)'].split('|')
                    list.append(types)
                # record the ortholog ids in the same A/B order as the
                # interaction's interactor list
                interactor_a, interactor_b = '', ''
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                # NOTE(review): assumes all three columns are present and that
                # there are exactly two publication ids per reference
                # (pmids[num * 2]); if every column is '-', list[0] raises
                # IndexError — confirm DIP rows always carry these fields.
                for num in range(0, len(list[0])):
                    type = types[num].split('(')[1][:-1]
                    pmid = pmids[num * 2].split('pubmed:')[1]
                    detection = detections[num].split('(')[1][:-1]
                    # there are more than one pmid sometimes
                    reference = InteractionReference(
                        interaction_id=interaction.id,
                        detection_method=detection,
                        pmid=pmid,
                        source_db=row['Source database(s)'].split('(')[1][:-1],
                        interactor_a=interactor_a,
                        interactor_b=interactor_b)
                    session.add(reference)
                # only (re)classify experimental status for interactions
                # created in this pass
                if is_new:
                    if interaction.is_experimental is None:
                        if is_experimental_psimi(
                                row['Interaction detection method(s)'].
                                split('MI:')[1][:4]):
                            interaction.is_experimental = 1
                        else:
                            interaction.is_experimental = 0
                    elif is_experimental_psimi(
                            row['Interaction detection method(s)'].split(
                                'MI:')[1][:4]):
                        interaction.is_experimental = 1
                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'DIP').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='DIP')
                    session.add(source)
                session.commit()
    print(session.query(Interaction).count())
def parse_mentha(file, strain, taxid, session):
    """Parse a mentha PSI-MI TAB file for one strain and load interactions.

    file: path to the tab-delimited mentha file.
    strain: strain label stored on each new Interaction.
    taxid: required taxid prefix; rows where either interactor's taxid
        differs are skipped.
    session: SQLAlchemy session used for all queries/commits.
    """
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            interactors = []
            # both interactors must belong to the requested taxid
            if ((row['Taxid interactor A'].split('|')[0] != taxid) |
                    (row['Taxid interactor B'].split('|')[0] != taxid)):
                continue
            A_id = row['#ID(s) interactor A'].split(':')[1]
            B_id = row['ID(s) interactor B'].split(':')[1]
            # resolve each end: Interactor by id first, else Protein by uniprot
            if session.query(Interactor).filter(
                    Interactor.id == A_id).first() is not None:
                interactors.append(session.query(Interactor).filter(
                    Interactor.id == A_id).one())
            elif session.query(Protein).filter(
                    Protein.uniprotkb == A_id).first() is not None:
                interactors.append(session.query(Protein).filter(
                    Protein.uniprotkb == A_id).one())
            if session.query(Interactor).filter(
                    Interactor.id == B_id).first() is not None:
                interactors.append(session.query(Interactor).filter(
                    Interactor.id == B_id).one())
            elif session.query(Protein).filter(
                    Protein.uniprotkb == B_id).first() is not None:
                interactors.append(session.query(Protein).filter(
                    Protein.uniprotkb == B_id).one())
            # need both ends resolved to proceed
            if len(interactors) != 2:
                continue
            homogenous = (interactors[0] == interactors[1])
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactors[0]),
                Interaction.interactors.contains(interactors[1]),
                Interaction.homogenous == homogenous).first()
            if interaction is None:
                type = (interactors[0].type + '-' + interactors[1].type)
                interaction = Interaction(strain=strain, type=type,
                                          homogenous=homogenous,
                                          interactors=interactors)
                # NOTE(review): assumes the detection-method column always
                # contains an 'MI:' code — a '-' here would raise IndexError;
                # confirm against the mentha export.
                if is_experimental_psimi(
                        row['Interaction detection method(s)'].split(
                            'MI:')[1][:4]):
                    interaction.is_experimental = 1
                else:
                    interaction.is_experimental = 0
                session.add(interaction), session.commit()
            else:
                # never downgrade an existing interaction's experimental flag
                if is_experimental_psimi(
                        row['Interaction detection method(s)'].split(
                            'MI:')[1][:4]):
                    interaction.is_experimental = 1
            # NOTE(review): a reference is added for every row, with no
            # duplicate check like parse_psimi performs — confirm intended.
            reference = InteractionReference(
                interaction_id=interaction.id,
                detection_method=row['Interaction detection method(s)'].split(
                    '(')[1][:-1],
                pmid=row['Publication Identifier(s)'].split('pubmed:')[1][:8],
                interaction_type=row['Interaction type(s)'].split('(')[1][:-1],
                source_db=row['Source database(s)'].split('(')[1][:-1],
                confidence_score=row['Confidence value(s)'])
            session.add(reference)
            xref_field = row['Interaction identifier(s)'].split(':')
            xref = session.query(InteractionXref).filter(
                InteractionXref.accession == xref_field[1],
                InteractionXref.interaction_id == interaction.id).first()
            if xref is None:
                xref = InteractionXref(interaction_id=interaction.id,
                                       accession=xref_field[1],
                                       data_source=xref_field[0])
                session.add(xref)
            source = session.query(InteractionSource).filter(
                InteractionSource.interaction_id == interaction.id,
                InteractionSource.data_source == 'mentha').first()
            if source is None:
                source = InteractionSource(interaction_id=interaction.id,
                                           data_source='mentha')
                session.add(source)
            # NOTE(review): no final session.commit() before the print —
            # pending adds are presumably committed by the caller; verify.
    print(session.query(Interaction).count())
def parse_ecoli_mentha(session):
    """Load E. coli mentha interactions mapped through same-strain ortholog pairs.

    Each row's uniprot-identified interactors are resolved to OrthologEcoli
    entries; every ortholog pair in the same strain becomes (or updates) an
    ortholog-derived Interaction ('fe' new, 'cfe' appended to existing).

    session: SQLAlchemy session used for all queries/commits.
    """
    with open('Ecoli/PSICQUIC/mentha.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            # skip rows missing either interactor id
            if (row['#ID(s) interactor A'] == '-') | \
                    (row['ID(s) interactor B'] == '-'):
                continue
            interactors = []
            orthologs_A = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot ==
                row['#ID(s) interactor A'].split(':')[1]).all()
            orthologs_B = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot ==
                row['ID(s) interactor B'].split(':')[1]).all()
            # keep only ortholog pairs that belong to the same strain
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if (ortholog_A is not None) and (ortholog_B is not None):
                        if ortholog_A.strain_protein == ortholog_B.strain_protein:
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein, ortholog_B.ortholog_id]])
            for interactor_pair in interactors:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # existing interaction: mark as confirmed-from-E.coli
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # BUG FIX: type concatenated a string with the protein
                    # object (TypeError) — interactor_pair[1][0] was missing
                    # the trailing .type.
                    interaction = Interaction(
                        strain=interactor_pair[0][0].strain,
                        interactors=[
                            interactor_pair[0][0], interactor_pair[1][0]
                        ],
                        type=(interactor_pair[0][0].type + '-' +
                              interactor_pair[1][0].type),
                        ortholog_derived='fe')
                    # ask about marking ecoli ortholog interactions as experimental!!
                    if 'MI:' in row['Interaction detection method(s)']:
                        # iterate through all methods
                        if is_experimental_psimi(
                                row['Interaction detection method(s)'].split(
                                    'MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction), session.commit()
                # record the ortholog ids in the same A/B order as the
                # interaction's interactor list
                interactor_a, interactor_b = '', ''
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    psimi_detection=row['Interaction detection method(s)'].
                    split('MI:')[1][:4],
                    detection_method=row['Interaction detection method(s)'].
                    split('(')[1][:-1],
                    pmid=row['Publication Identifier(s)'].split('pubmed:')[1],
                    psimi_type=row['Interaction type(s)'].split('MI:')[1][:4],
                    interaction_type=row['Interaction type(s)'].split(
                        '(')[1][:-1],
                    psimi_db=row['Source database(s)'].split('MI:')[1][:4],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    confidence_score=row['Confidence value(s)'])
                session.add(reference)
                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'mentha').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='mentha')
                    session.add(source)
                session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_irefindex(session):
    """Load E. coli iRefIndex interactions mapped through same-strain ortholog pairs.

    Each row's interactors (uniprot or refseq identified) are resolved to
    OrthologEcoli entries; every ortholog pair in the same strain becomes
    (or updates) an ortholog-derived Interaction ('fe' new, 'cfe' appended).

    session: SQLAlchemy session used for all queries/commits.
    """
    with open('Ecoli/PSICQUIC/iRefIndex.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            # skip rows missing either interactor id
            if (row['#ID(s) interactor A'] == '-') | \
                    (row['ID(s) interactor B'] == '-'):
                continue
            interactors = []
            orthologs_A = []
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'refseq':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_A[1]).all()
            # no orthologs for A means nothing to map for this row
            if len(orthologs_A) == 0:
                continue
            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            elif id_B[0] == 'refseq':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_B[1]).all()
            # keep only ortholog pairs that belong to the same strain
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if (ortholog_A is not None) and (ortholog_B is not None):
                        if ortholog_A.strain_protein == ortholog_B.strain_protein:
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein, ortholog_B.ortholog_id]])
            for interactor_pair in interactors:
                homogenous = (interactor_pair[0][0] == interactor_pair[1][0])
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor_pair[0][0]),
                    Interaction.interactors.contains(interactor_pair[1][0]),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # existing interaction: mark as confirmed-from-E.coli
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # BUG FIX: type concatenated a string with the protein
                    # object (TypeError) — interactor_pair[1][0] was missing
                    # the trailing .type.
                    interaction = Interaction(
                        strain=interactor_pair[0][0].strain,
                        interactors=[
                            interactor_pair[0][0], interactor_pair[1][0]
                        ],
                        type=(interactor_pair[0][0].type + '-' +
                              interactor_pair[1][0].type),
                        ortholog_derived='fe')
                    if 'MI:' in row['Interaction detection method(s)']:
                        # iterate through all methods
                        if is_experimental_psimi(
                                row['Interaction detection method(s)'].split(
                                    'MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction), session.commit()
                # record the ortholog ids in the same A/B order as the
                # interaction's interactor list
                interactor_a, interactor_b = '', ''
                if interaction.interactors[0] == interactor_pair[0][0]:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]
                author, date, psimi_type, type = None, None, None, None
                # [None] placeholders ensure the reference loops below run
                # once even when a column is absent
                confidences, psimi_detections, detections, pmids = [None], [
                    None
                ], [None], [None]
                if row['Publication 1st author(s)'] != '-':
                    author = row['Publication 1st author(s)'].split(' ')[0]
                    date = row['Publication 1st author(s)'].split('(')[1][:-1]
                if row['Interaction type(s)'] != '-':
                    type = row['Interaction type(s)'].split('(')[1][:-1]
                    if 'MI' in row['Interaction type(s)']:
                        psimi_type = row['Interaction type(s)'].split(
                            'MI:')[1][:4]
                if row['Publication Identifier(s)'] != '-':
                    del pmids[0]
                    for pmid in row['Publication Identifier(s)'].split('|'):
                        pmids.append(pmid.split(':')[1])
                if row['Interaction detection method(s)'] != '-':
                    del detections[0]
                    del psimi_detections[0]
                    # BUG FIX: the original iterated
                    # row['Publication Identifier(s)'] here (copy-paste),
                    # which holds pmids, not detection methods.
                    for detection in row['Interaction detection method(s)'].split(
                            '|'):
                        detections.append(detection.split('(')[1][:-1])
                        psimi_detections.append(detection.split('MI:')[1][:4])
                for pmid in pmids:
                    for confidence in confidences:
                        for (detection, psimi_detection) in zip(
                                detections, psimi_detections):
                            # NOTE(review): `date=` differs from the
                            # `pub_date=` kwarg used by sibling parsers —
                            # confirm against the InteractionReference model.
                            reference = InteractionReference(
                                interaction_id=interaction.id,
                                psimi_detection=psimi_detection,
                                detection_method=detection,
                                author_ln=author,
                                date=date,
                                psimi_type=psimi_type,
                                interaction_type=type,
                                # BUG FIX: was split('MI')[1][:4], which
                                # yields ':046' instead of the 4-digit PSI-MI
                                # code; every other parser splits on 'MI:'.
                                psimi_db=row['Source database(s)'].split(
                                    'MI:')[1][:4],
                                source_db=row['Source database(s)'].split(
                                    '(')[1][:-1],
                                confidence=confidence,
                                interactor_a=interactor_a,
                                interactor_b=interactor_b)
                            session.add(reference)
                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'iRefIndex').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='iRefIndex')
                    session.add(source)
                session.commit()
    print(session.query(Interaction).count())