def csConsistencyCheck(self):
    """Perform a consistency check of this record against ChemSpider.

    Records whose ``custom`` flag is truthy are not checked at all. For any
    other record the compound identified by ``self.CSID`` is fetched; a blank
    ``name``, ``INCHI``, ``smiles`` or ``formula`` is filled in from the
    ChemSpider compound, while a non-blank value that disagrees with
    ChemSpider is recorded as an error.

    Raises:
        ValidationError: with code ``no_csid`` when no CSID is set, or
            wrapping the list of all collected mismatch errors once every
            field has been checked.
    """
    if self.custom:
        return

    errorList = []
    cs = ChemSpider(settings.CHEMSPIDER_TOKEN)

    # Compare by value: ``is ''`` tests object identity, which is not
    # guaranteed to hold for equal strings.
    if self.CSID in (None, ''):
        raise ValidationError('No CSID set', 'no_csid')

    csCompound = cs.get_compound(self.CSID)

    if self.name not in ('', None):
        nameResults = cs.simple_search(self.name)
        if csCompound not in nameResults:
            # NOTE(review): the code 'invalid_inchi' looks copy-pasted from
            # the InChI check below; kept as-is because callers may match on it.
            errorList.append(ValidationError(
                'A compound was consistency checked and was found to have an invalid name',
                code='invalid_inchi'))
    else:
        self.name = csCompound.common_name

    # (attribute on self, ChemSpider value, error message, error code)
    field_checks = (
        ('INCHI', csCompound.stdinchi,
         'A compound was consistency checked and was found to have an invalid InChi',
         'invalid_inchi'),
        ('smiles', csCompound.smiles,
         'A compound was consistency checked and was found to have an invalid smiles string',
         'invalid_smiles'),
        ('formula', csCompound.molecular_formula,
         'A compound was consistency checked and was found to have an invalid formula',
         'invalid_formula'),
    )
    for attr, expected, message, code in field_checks:
        current = getattr(self, attr)
        if current == '':
            setattr(self, attr, expected)
        elif current != expected:
            # The formula branch previously appended to the misspelled
            # ``errorsList`` and crashed with NameError.
            errorList.append(ValidationError(message, code=code))

    if errorList:
        raise ValidationError(errorList)
def __init__(self):
    """Create empty data holders, the character translator and the ChemSpider client."""
    # Reaction dataframe, reactant dataframe and unique-reactants dictionary
    # all start out unset.
    self.reactions_dataframe = self.species_df = self.unique_species_dict = None

    # Translation table used to clean reactant strings.
    # str.maketrans(intab, outtab, chars_to_delete): "Î" is replaced by "α",
    # and each of "±", "€", "™" is mapped to None (removed).
    self.translator = str.maketrans("Î", "α", "±€™")

    # Authenticate against the ChemSpider API with the stored token
    # (Maneet's token).
    self.security_token = "99c9f388-12be-4b22-8f83-00b6f1e2d7d0"
    self.cs = ChemSpider(
        self.security_token,
        user_agent="StudentResearcher, ChemSpiPy 1.0.5, Python 3.6",
    )

    print('--Populator Initialized--')
def get_image_url(self):
    """Return a ChemSpider image URL for this document, or None.

    The pickled ``self.metadata`` is decoded first. If it already holds a
    ``csid`` key, the URL is built from ``self.csid``. Otherwise, if it holds
    an ``InChIKey``/``inchikey`` entry, ChemSpider is searched with that key;
    on a hit the csid is written back into the metadata, the document is
    saved, and the hit's image URL is returned. In every other case the
    result is None.
    """
    meta = jsonpickle.decode(self.metadata)

    # A csid is already known: build the URL directly.
    if 'csid' in meta:
        base_url = 'http://www.chemspider.com/ImagesHandler.ashx?id='
        return base_url + str(self.csid)

    # Neither a csid nor an InChIKey: no image.
    if not ('InChIKey' in meta or 'inchikey' in meta):
        return None

    # Look the csid up by InChIKey.
    # Original author's note: this code doesn't work...due to an upgrade in
    # chemspider; if you want images, get the mol.
    from chemspipy import ChemSpider
    client = ChemSpider(settings.CHEMSPIDER_APIKEY)
    inchikey = meta.get('InChIKey', meta.get('inchikey'))
    hits = client.search(inchikey)
    if not hits:
        return None

    # Remember the csid on the document, then hand back the image URL.
    meta['csid'] = hits[0].csid
    self.metadata = jsonpickle.encode(meta)
    self.save()
    return hits[0].image_url
def find_matches(matched_in_ChemSpider, massFile_Name):
    """Return, per product, the ChemSpider IDs whose mol file matches the product.

    Args:
        matched_in_ChemSpider: mapping of product key -> iterable of candidate
            CSIDs (compounds with almost the same mass).
        massFile_Name: passed through to ``read_product_molFile`` together
            with the product key.

    Returns:
        dict mapping each product key that has at least one match to the list
        of matching CSIDs. Products without a match are omitted; an empty
        input yields an empty dict.
    """
    from chemspipy import ChemSpider
    cs = ChemSpider('dfdc677d-e7d3-435b-a74e-bfe6167a3899')

    # Created once, outside the loop: it used to be re-created for every
    # product, so only the last product's matches were ever returned (and an
    # empty input left the name unbound at ``return``).
    matches = {}

    for i in matched_in_ChemSpider:
        # Single-argument form prints identically on Python 2 and 3.
        print(i)

        # Mol file of the product itself.
        product_molFile = read_product_molFile(massFile_Name, i)

        # Keep every candidate whose 2D mol file equals the product's.
        matched_compounds = []
        for CSID in matched_in_ChemSpider[i]:
            c = cs.get_compound(CSID)
            if compare_two_molFiles(product_molFile, c.mol_2d):
                matched_compounds.append(CSID)

        if matched_compounds:
            matches[i] = matched_compounds

    return matches
def set_and_initialize_token(self, input_token):
    """
    Remember the ChemSpider security token on the object and build the
    ChemSpider API client from it.

    :param input_token: your security token (for ChemSpider)
    :return: None
    """
    self.security_token = input_token
    # The client is built from the value that was just stored.
    client = ChemSpider(self.security_token)
    self.cs = client
def __init__(self, user, *args, **kwargs):
    """Build the form and restrict the lab-group choices to the user's groups.

    ``user`` is consumed here; every other positional/keyword argument is
    forwarded to the parent form constructor.
    """
    super(CompoundForm, self).__init__(*args, **kwargs)
    self.compound = None
    self.chemSpider = ChemSpider(settings.CHEMSPIDER_TOKEN)

    lab_group_field = self.fields['labGroups']
    lab_group_field.queryset = user.labgroup_set.all()

    # When the user belongs to at least one lab group, drop the empty choice.
    user_has_groups = user.labgroup_set.all().exists()
    if user_has_groups:
        lab_group_field.empty_label = None
def clean_name(self):
    """Validate that the submitted name is a ChemSpider synonym of this compound.

    The name is searched on ChemSpider; it is accepted only if one of the
    hits carries the CSID of the form's instance. Returns the cleaned name,
    otherwise raises ValidationError.
    """
    client = ChemSpider(settings.CHEMSPIDER_TOKEN)
    hits = client.simple_search(self.cleaned_data['name'])

    wanted_csid = self.instance.CSID
    for hit in hits:
        if hit.csid == wanted_csid:
            return self.cleaned_data['name']

    raise ValidationError(
        "That name is not a known synonym for this compound")
def search_by_mass(mass, margine):
    """Return the ChemSpider IDs of all hits of a mass search.

    ``mass`` and ``margine`` are forwarded unchanged to
    ``ChemSpider.simple_search_by_mass``. Requires ``pip install chemspipy``
    and a registered security code.
    """
    from chemspipy import ChemSpider

    client = ChemSpider('dfdc677d-e7d3-435b-a74e-bfe6167a3899')
    hits = client.simple_search_by_mass(mass, margine)
    return [hit.csid for hit in hits]
def find_common_name(inchikey, formula):
    """Return the ChemSpider common name for ``inchikey``, else ``formula``.

    The common name is used only when the module-level ``chemspikey`` is set,
    ``inchikey`` is non-empty and the search yields exactly one hit.
    """
    if not chemspikey:
        return formula

    client = ChemSpider(chemspikey)
    if not len(inchikey) > 0:
        return formula

    hits = client.search(inchikey)
    # Ambiguous (or empty) search results fall back to the formula.
    return hits[0].common_name if len(hits) == 1 else formula
def find_common_name(inchikey):
    """Return the ChemSpider common name for ``inchikey``, or None.

    A name is returned only when the module-level ``chemspikey`` is set,
    ``inchikey`` is non-empty and the search yields exactly one hit.
    """
    if not chemspikey:
        return None

    client = ChemSpider(chemspikey)
    if not len(inchikey) > 0:
        return None

    hits = client.search(inchikey)
    # Ambiguous (or empty) search results give no name.
    return hits[0].common_name if len(hits) == 1 else None
def structure_url(self):
    """Return an image URL for the structure named ``self.name``.

    Without a ``CHEMSPIDER_KEY`` setting a fixed placeholder image URL is
    returned. Otherwise ChemSpider is searched by name and the first hit's
    image URL is returned, or an empty string when there are no hits.
    """
    from chemspipy import ChemSpider

    try:
        cs_key = settings.CHEMSPIDER_KEY
    except AttributeError:
        # No API key configured: fall back to the placeholder picture.
        return 'http://discovermagazine.com/~/media/Images/Zen%20Photo/N/nanoputian/3487.gif'

    hits = ChemSpider(cs_key).simple_search(self.name)
    try:
        return hits[0].image_url
    except IndexError:
        return ""
def database_setup():
    """Download 2D and 3D molecule structures from ChemSpider into a new folder.

    The IDs come from ``get_id()`` and the target folder from ``DATABASE``.
    If the folder already exists a message is printed and the process exits.
    For every ID two files are written inside the folder:
    ``<id>_2d.txt`` (``mol_2d``) and ``<id>_3d.txt`` (``mol_3d``).

    Files are addressed by path instead of changing the working directory,
    so the caller's cwd is left untouched (previously ``os.chdir('../')``
    only returned to the start directory for a single-level relative
    ``DATABASE`` and was skipped entirely when a download failed).
    """
    from chemspipy import ChemSpider

    # Compile the ID list of the molecules to fetch.
    id_list = get_id()
    directory = DATABASE

    # Refuse to overwrite an existing database folder.
    if os.path.isdir(directory):
        print('Database folder already existed! Aborting... \n '
              'Please remove the folder and rerun')
        exit()

    os.mkdir(directory)
    print('downloading..')

    # NOTE(review): 'text' looks like a placeholder API key — confirm.
    cs = ChemSpider('text')

    for id_chemspider in id_list:
        path_2d = os.path.join(directory, str(id_chemspider) + '_2d.txt')
        path_3d = os.path.join(directory, str(id_chemspider) + '_3d.txt')

        # Skip IDs that were already downloaded.
        if os.path.exists(path_2d):
            print('ID ' + str(id_chemspider) + ' already existed')
            continue

        c = cs.get_compound(id_chemspider)

        # ``with`` closes each handle even if fetching or writing fails.
        with open(path_2d, 'w') as f:
            f.write(c.mol_2d)  # 2D coordinates and bond data
        with open(path_3d, 'w') as f:
            f.write(c.mol_3d)  # 3D coordinates and bond data
def get_chemspider_structure(csid):
    """
    Fetch a compound from ChemSpider, convert its 3D mol block to PDB with
    hydrogens added, write it to ``<csid>.pdb`` and return that file name.
    """
    token = 'a03b1636-afc3-4204-9a2c-ede27680577c'  # XXX
    compound = ChemSpider(token).get_compound(csid)

    # mol -> pdb conversion.
    converter = ob.OBConversion()
    converter.SetInAndOutFormats('mol', 'pdb')
    molecule = ob.OBMol()
    converter.ReadString(molecule, compound.mol_3d)
    molecule.AddHydrogens()

    pdbpath = '{}.pdb'.format(csid)
    with open(pdbpath, 'w') as pdb_file:
        pdb_file.write(converter.WriteString(molecule))
    return pdbpath
def get_image_url(self):
    """Return a ChemSpider image URL for this document, or None.

    The pickled ``self.metadata`` is decoded first. If it already holds a
    ``csid`` key, the URL is built from ``self.csid``. Otherwise, if it holds
    an ``InChIKey``, ChemSpider is searched with it; on a hit the csid is
    written back into the metadata, the document is saved, and the hit's
    image URL is returned. In every other case the result is None.
    """
    meta = jsonpickle.decode(self.metadata)

    # A csid is already known: build the URL directly.
    if 'csid' in meta:
        base_url = 'http://www.chemspider.com/ImagesHandler.ashx?id='
        return base_url + str(self.csid)

    # Neither a csid nor an InChIKey: no image.
    if 'InChIKey' not in meta:
        return None

    # Look the csid up by InChIKey.
    from chemspipy import ChemSpider
    client = ChemSpider('b07b7eb2-0ba7-40db-abc3-2a77a7544a3d')
    hits = client.search(meta['InChIKey'])
    if not hits:
        return None

    # Remember the csid on the document, then hand back the image URL.
    meta['csid'] = hits[0].csid
    self.metadata = jsonpickle.encode(meta)
    self.save()
    return hits[0].image_url
def smiles2cas(smiles_input):
    """Look up the CAS registry numbers ChemSpider lists for a SMILES string.

    The SMILES is searched through the ChemSpider API; the csid of the last
    hit is then used to fetch the ChemSpider search page, from which the
    ``href`` of every ``div.syn a[title="RN"]`` link is collected.

    Returns:
        list with one entry per link, each being the list of double-quoted
        substrings found in that link's ``href``. An empty list is returned
        when the search has no hits (previously this crashed with an unbound
        local variable).
    """
    myToken = 'a1d50aa3-6729-49df-a3e1-cd66240fab22'
    cs = ChemSpider(security_token=myToken)

    # Keep the last hit of the search, as the original loop did.
    last_hit = None
    for last_hit in cs.search(smiles_input):
        pass
    if last_hit is None:
        return []
    res = str(last_hit.csid)

    http = requests.session()
    url = 'http://www.chemspider.com/MassSpecApi.asmx/GetExtendedCompoundInfoArray'
    http.post(url, data={'token': myToken})

    url_search = 'http://www.chemspider.com/Search.aspx?q=' + res
    r = http.get(url_search)
    soup = bs4.BeautifulSoup(r.text, "html.parser")

    hrefs = [a.attrs.get('href') for a in soup.select('div.syn a[title="RN"]')]
    # Extract whatever sits between double quotes inside each href.
    return [re.findall(r"\"(.+?)\"", href) for href in hrefs]
from nose.tools import eq_, ok_, raises
import requests
import six

from chemspipy import ChemSpider, MOL2D, MOL3D, BOTH
from chemspipy.errors import ChemSpiPyAuthError, ChemSpiPyServerError


# NOTE(review): ``logging`` and ``os`` are used below but their imports are
# not visible in this excerpt — presumably imported earlier in the file.
# Root logger at WARN, but full DEBUG output for the chemspipy logger.
logging.basicConfig(level=logging.WARN)
logging.getLogger('chemspipy').setLevel(logging.DEBUG)

# Security token is retrieved from environment variables
# (raises KeyError at import time when the variable is not set).
CHEMSPIDER_SECURITY_TOKEN = os.environ['CHEMSPIDER_SECURITY_TOKEN']

# Chemspider instances with and without a security token
cs = ChemSpider(security_token=CHEMSPIDER_SECURITY_TOKEN)
cs2 = ChemSpider()


def test_no_security_token():
    """Test ChemSpider can be initialized with no parameters."""
    eq_(cs2.security_token, None)


def test_security_token():
    """Test security token is set correctly when initializing ChemSpider"""
    eq_(cs.security_token, CHEMSPIDER_SECURITY_TOKEN)


# NOTE(review): only the docstring is visible here; the test body may have
# been cut off in this excerpt — confirm against the full file.
def test_chemspider_repr():
    """Test ChemSpider object repr."""
def __init__(self):
    """Read the ChemSpider key and API URL from the settings and build the client."""
    constants = SettingsConstants()

    api_key = constants.get('CHEMSPI_KEY')
    self.key = api_key

    api_url = constants.get('CHEMSPI_API_URL')
    self.url = api_url

    self.cs = ChemSpider(api_key, api_url=api_url)
if not args.from_db: from chemspipy import ChemSpider if not args.export_db_only: import pandas as pd if args.from_db or args.export_db_only or args.export_db_csv: import shelve ## ==================== set up chemspider ==================== if not args.from_db: possiblefile = os.path.expanduser(args.token) if os.path.exists(possiblefile): # is file with open(possiblefile) as f: csp = ChemSpider(f.read().strip()) else: csp = ChemSpider(args.token) # else is token else: csp = None spq = spiderquery(csp, args.prefix + '_p') ## ==================== list of compounds ==================== if args.inputfile: with open(args.inputfile) as csvfile: f = csv.reader(csvfile) compounds = [] j = 0 for i, row in enumerate(f):
description="Script to obtain SMILES for a solutes in a list") argparser.add_argument('-db', '--db', help="the molecule database") argparser.add_argument('-solvent', '--solvent', help="the solvent", default="water") argparser.add_argument('-solutes', '--solutes', help="the list of solutes") args = argparser.parse_args() db = dblib.SolvDb(filename=args.db, type="abs", filehandle="^0") solutes = [s.strip() for s in open(args.solutes, 'r').readlines()] if os.getenv("SPIDERKEY") is None: print "SPIDERKEY environmental variable not set! Exit." quit() cs = ChemSpider(os.getenv("SPIDERKEY")) # Loop over all the database entries in the solute lists n = 0 for entry in db.itersolutelist(args.solvent, solutes): if os.path.exists(entry.FileHandle + ".smi"): continue hits = cs.search(entry.SoluteName) if len(hits) > 0: smi = hits[0].smiles with open(entry.FileHandle + ".smi", "w") as f: f.write("%s\n" % smi) else: print entry.SoluteName, entry.FileHandle n += 1 print "Looped over %d solutes" % n
from chemspipy import ChemSpider

# Module-wide ChemSpider client shared by the helpers below.
cs = ChemSpider('c48d4595-ead2-40e7-85c9-1e5d2a77754c')


def get_chem(query):
    """Return ``{'name': ..., 'smiles': ...}`` for the first ChemSpider hit.

    ``name`` is the hit's common name. Returns None when the search for
    ``query`` has no results.
    """
    hits = cs.search(query)
    if not hits:
        return None
    return {'name': hits[0].common_name, 'smiles': hits[0].smiles}


def get_smiles(query):
    """Return the SMILES of the first ChemSpider hit for ``query``, or None."""
    hits = cs.search(query)
    if not hits:
        return None
    return hits[0].smiles
def __init__(self, api_key):
    """Create the ChemSpider client for ``api_key`` and keep it on the instance."""
    client = ChemSpider(api_key)
    self.chemspider_web_api = client
# NOTE(review): this function was recovered from a whitespace-collapsed
# source. Statement order and every token are unchanged, but the nesting was
# reconstructed from the logic — confirm the indentation against the
# original file before relying on it.
def process(self,
            input_text: str = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            sdf_append: bool = False,
            input_type: str = "",
            lang: str = "eng",
            paged_text: bool = False,
            format_output: bool = True,
            opsin_types: list = None,
            standardize_mols: bool = True,
            convert_ions: bool = True,
            write_header: bool = True,
            iob_format: bool = False,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            normalize_text: bool = True,
            remove_duplicates: bool = False,
            annotate: bool = True,
            annotation_sleep: int = 2,
            chemspider_token: str = "",
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with ChemSpot.

    Parameters
    ----------
    input_text : str
        String to be processed by ChemSpot.
    input_file : str
        Path to file to be processed by ChemSpot.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in. SDF is from OPSIN converted entities.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist. SDF is from OPSIN converted entities.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf", "pdf_scan", "image" or "text" and magic bytes check will be skipped.
    lang : str
        | Language which will Tesseract use for OCR. Available languages: https://github.com/tesseract-ocr/tessdata
        | Multiple languages can be specified with "+" character, i.e. "eng+bul+fra".
    paged_text : bool
        If True and `input_type` is "text" or `input_text` is provided, try to assign pages to chemical entities.
        ASCII control character 12 (Form Feed, '\f') is expected between pages.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    opsin_types : list
        | List of ChemSpot entity types. Entities of types in this list will be converted with OPSIN.
          If you don't want to convert entities, pass empty list.
        | OPSIN is designed to convert IUPAC names to linear notation (SMILES etc.) so default value of `opsin_types`
          is ["SYSTEMATIC"] (these should be only IUPAC names).
        | ChemSpot entity types: "SYSTEMATIC", "IDENTIFIER", "FORMULA", "TRIVIAL", "ABBREVIATION", "FAMILY", "MULTIPLE"
    standardize_mols : bool
        If True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules converted by OPSIN.
    convert_ions : bool
        If True, try to convert ion entities (e.g. "Ni(II)") to SMILES. Entities matching ion regex won't be converted with OPSIN.
    write_header : bool
        If True and if `output_file` is set and `output_format` is True, write a CSV write_header:
        "smiles", "bond_length", "resolution", "confidence", "learn", "page", "coordinates"
    iob_format : bool
        If True, output will be in IOB format.
    dry_run : bool
        If True, only return list of commands to be called by subprocess.
        (The code returns them joined into a single space-separated string.)
    csv_delimiter : str
        Delimiter for output CSV file.
    normalize_text : bool
        If True, normalize text before performing NER. It is strongly recommended to do so, because without normalization
        can ChemSpot produce unpredictable results which cannot be parsed.
    remove_duplicates : bool
        If True, remove duplicated chemical entities. Note that some entities-compounds can have different names, but
        same notation (SMILES, InChI etc.). This will only remove entities with same names. Not applicable for IOB format.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for entity name, SMILES etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
        | If textual entity has single result in DB when searched by name, fill in missing identifiers (SMILES etc.).
    annotation_sleep: int
        How many seconds to sleep between annotation of each entity. It's for preventing overloading of databases.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API (needed for annotation). Make account there to obtain it.
    continue_on_failure : bool
        | If True, continue running even if ChemSpot returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from ChemSpot
        - stderr: str ... standard error output from ChemSpot
        - exit_code: int ... exit code from ChemSpot
        - content

          - list of OrderedDicts ... when `format_output` is True
          - None ... when `format_output` is False

        - normalized_text : str

    Raises
    ------
    ValueError
        When neither `input_text` nor `input_file` is set, or the input type is not one of the accepted values.
    UserWarning
        When `input_text` is empty after normalization.
    RuntimeError
        When "OutOfMemoryError" appears in ChemSpot's standard error output.
    """

    # ---- argument defaults and validation ----
    if opsin_types is None:
        opsin_types = ["SYSTEMATIC"]

    if input_text and input_file:
        input_file = ""
        self.logger.warning("Both 'input_text' and 'input_file' are set, but 'input_text' will be prefered.")
    elif not input_text and not input_file:
        raise ValueError("One of 'input_text' or 'input_file' must be set.")

    # Detect or validate the input type; only relevant when reading from a file.
    if not input_type and not input_text:
        possible_input_types = ["pdf", "image", "text"]
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            raise ValueError("Input file type ({}) is not one of {}".format(input_type, possible_input_types))
    elif input_type and not input_text:
        possible_input_types = ["pdf", "pdf_scan", "image", "text"]
        if input_type not in possible_input_types:
            raise ValueError("Unknown 'input_type'. Possible 'input_type' values are {}".format(possible_input_types))

    # Non-text inputs are turned into text first; from here on the text is the input.
    if input_type in ["pdf", "pdf_scan", "image"]:
        input_text, _ = get_text(input_file, input_type, lang=lang, tessdata_prefix=os.environ["TESSDATA_PREFIX"])
        input_file = ""

    if annotate and not chemspider_token:
        self.logger.warning("Cannot perform annotation in ChemSpider: 'chemspider_token' is empty.")

    # ---- build the ChemSpot command line ----
    # "iob_format" is only added to the options when it is truthy.
    options = ChainMap({k: v for k, v in {"iob_format": iob_format}.items() if v}, self.options_internal)
    output_file_temp = None

    commands, _, _ = self.build_commands(options, self._OPTIONS_REAL, self.path_to_binary)
    commands.insert(1, str(self.options_internal["max_memory"]))
    commands.append("-t")

    if normalize_text:
        normalizer = Normalizer(strip=True, collapse=True, hyphens=True, quotes=True, slashes=True, tildes=True,
                                ellipsis=True)

        if input_file:
            with open(input_file, mode="r") as f:
                input_text = f.read()

        input_text = normalizer(input_text)

        if not input_text:
            raise UserWarning("'input_text' is empty after normalization.")

        input_text = self.normalize_text(text=input_text)
        # The normalized text is written to a temporary file whose name is
        # handed to ChemSpot; flush() pushes the content out before that.
        input_file_normalized = NamedTemporaryFile(mode="w", encoding="utf-8")
        input_file_normalized.write(input_text)
        input_file_normalized.flush()
        input_file = input_file_normalized.name
    else:
        if input_text:
            # Same temporary-file hand-over for un-normalized text input.
            input_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
            input_file_temp.write(input_text)
            input_file_temp.flush()
            input_file = input_file_temp.name

    commands.append(os.path.abspath(input_file))
    commands.append("-o")
    if format_output:
        # Output goes to a temporary file that is parsed further below.
        output_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8")
        commands.append(os.path.abspath(output_file_temp.name))
    else:
        commands.append(os.path.abspath(output_file))

    if dry_run:
        return " ".join(commands)

    # ---- run ChemSpot ----
    stdout, stderr, exit_code = common_subprocess(commands)

    if "OutOfMemoryError" in stderr:
        raise RuntimeError("ChemSpot memory error: {}".format(stderr))

    to_return = {"stdout": stdout, "stderr": stderr, "exit_code": exit_code, "content": None,
                 "normalized_text": input_text if normalize_text else None}

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("ChemSpot error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if normalize_text:
        to_return["normalized_text"] = input_text

    if not format_output:
        return to_return
    elif format_output:
        # ---- parse ChemSpot output ----
        with open(output_file_temp.name, mode="r", encoding="utf-8") as f:
            output_chs = f.read()

        entities = self.parse_chemspot_iob(text=output_chs) if iob_format else self.parse_chemspot(text=output_chs)
        to_return["content"] = entities

        if remove_duplicates and not iob_format:
            # Order-preserving de-duplication by entity name; seen_add()
            # returns None, so the ``or`` both records and keeps first occurrences.
            seen = set()
            seen_add = seen.add
            to_return["content"] = [x for x in to_return["content"] if
                                    not (x["entity"] in seen or seen_add(x["entity"]))]

        if input_type in ["pdf", "pdf_scan"] or paged_text:
            # Cumulative end offsets of the non-blank pages (split on form
            # feed); used below to map an entity's start offset to a page.
            page_ends = []
            for i, page in enumerate(input_text.split("\f")):
                if page.strip():
                    try:
                        page_ends.append(page_ends[-1] + len(page) - 1)
                    except IndexError:
                        # First non-blank page: there is no previous end yet.
                        page_ends.append(len(page) - 1)

        # ---- OPSIN conversion of the selected entity types ----
        if opsin_types:
            if convert_ions:
                # Entities matching the ion regex are handled separately below.
                to_convert = [x["entity"] for x in to_return["content"] if
                              x["type"] in opsin_types and not self.re_ion.match(x["entity"])]
            else:
                to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types]

            if to_convert:
                opsin = OPSIN(verbosity=self.verbosity)
                opsin_converted = opsin.process(input=to_convert, output_formats=["smiles", "inchi", "inchikey"],
                                                standardize_mols=standardize_mols, output_file_sdf=output_file_sdf,
                                                sdf_append=sdf_append)
                # Consumed with next() in entity order in the loop below.
                opsin_converted = iter(opsin_converted["content"])
            else:
                self.logger.info("Nothing to convert with OPSIN.")

        if annotate:
            chemspider = ChemSpider(chemspider_token) if chemspider_token else None

        # ---- per-entity post-processing ----
        for i, ent in enumerate(to_return["content"]):
            if input_type in ["pdf", "pdf_scan"] or paged_text:
                ent["page"] = str(bisect.bisect_left(page_ends, int(ent["start"])) + 1)

            if convert_ions:
                match_ion = self.re_ion.match(ent["entity"])
                if match_ion:
                    match_ion = match_ion.groupdict()
                    match_charge = self.re_charge.search(match_ion["charge"])
                    if match_charge:
                        match_charge = match_charge.groupdict()

                        # Build a bracketed SMILES atom from the ion symbol and its charge.
                        # NOTE(review): if no branch below assigns ``smiles`` it is
                        # unbound (or stale from a previous entity) — confirm the
                        # regexes make that impossible.
                        if match_charge["roman"]:
                            smiles = "[{}+{}]".format(match_ion["ion"], len(match_charge["roman"]))
                        elif match_charge["digit"]:
                            if "+" in match_ion["charge"]:
                                smiles = "[{}+{}]".format(match_ion["ion"], match_charge["digit"])
                            elif "-" in match_ion["charge"]:
                                smiles = "[{}-{}]".format(match_ion["ion"], match_charge["digit"])
                        elif match_charge["signs"]:
                            smiles = "[{}{}{}]".format(match_ion["ion"], match_charge["signs"][0],
                                                       len(match_charge["signs"]))

                        mol = MolFromSmiles(smiles)
                        if mol:
                            inchi = MolToInchi(mol)
                            if inchi:
                                ent.update(OrderedDict(
                                    [("smiles", smiles), ("inchi", inchi), ("inchikey", InchiToInchiKey(inchi))]))
                            else:
                                ent.update(OrderedDict([("smiles", smiles), ("inchi", ""), ("inchikey", "")]))
                        else:
                            ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))
                    else:
                        ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")]))

            if opsin_types and to_convert:
                if ent["entity"] in to_convert:
                    ent_opsin = next(opsin_converted)
                    ent.update(OrderedDict([("smiles", ent_opsin["smiles"]), ("inchi", ent_opsin["inchi"]),
                                            ("inchikey", ent_opsin["inchikey"]), ("opsin_error", ent_opsin["error"])]))
                elif convert_ions and self.re_ion.match(ent["entity"]):
                    # Ion entities keep their identifiers from above; only the error column is added.
                    ent.update(OrderedDict([("opsin_error", "")]))
                elif (convert_ions and not self.re_ion.match(ent["entity"])) or (
                        not convert_ions and ent["entity"] not in to_convert):
                    ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", ""), ("opsin_error", "")]))

            # TODO: this should be simplified...looks like garbage code
            if annotate:
                self.logger.info("Annotating entity {}/{}...".format(i + 1, len(to_return["content"])))
                # Column prefixes: pch_* are filled from get_compounds() results
                # (.cid), chs_* from chemspider.search() results (.csid).
                ent.update(OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""),
                                        ("pch_cids_by_name", ""), ("chs_cids_by_name", ""),
                                        ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""),
                                        ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""),
                                        ("pch_cids_by_formula", ""),
                                        ("pch_iupac_name", ""), ("chs_common_name", ""), ("pch_synonyms", "")]))

                # do "double-annotation": some entities can be found in only one DB, updated and then searched in second DB
                found_in_pch = False
                found_in_chs = False
                for _ in range(2):
                    results = []

                    # prefer InChI key
                    if "inchikey" in ent and ent["inchikey"]:
                        try:
                            results = get_compounds(ent["inchikey"], "inchikey")
                            if results:
                                if len(results) == 1:
                                    result = results[0]
                                    synonyms = result.synonyms
                                    if synonyms:
                                        ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                    ent["pch_iupac_name"] = result.iupac_name
                                    if not found_in_chs:
                                        ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                        ent["inchi"] = result.inchi or ent["inchi"]
                                        ent["inchikey"] = result.inchikey or ent["inchikey"]
                                ent["pch_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                        except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError,
                                TimeoutError, PubChemPyError):
                            # Best effort: lookup errors leave the columns empty.
                            pass

                        results = chemspider.search(ent["inchikey"]) if chemspider_token else []
                        if results:
                            if len(results) == 1:
                                result = results[0]
                                ent["chs_common_name"] = result.common_name
                                if not found_in_pch:
                                    ent["smiles"] = result.smiles or ent["smiles"]
                                    ent["inchi"] = result.stdinchi or ent["inchi"]
                                    ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                            ent["chs_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))
                    else:
                        # No InChI key: search by name, then by SMILES / InChI / formula.
                        if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                            try:
                                results = get_compounds(ent["entity"] or ent["abbreviation"], "name")
                                if results:
                                    if len(results) == 1:
                                        found_in_pch = True
                                        result = results[0]
                                        synonyms = result.synonyms
                                        if synonyms:
                                            ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms))
                                        # only update identifiers if they weren't found in second DB
                                        if not found_in_chs:
                                            ent["smiles"] = result.canonical_smiles or ent["smiles"]
                                            ent["inchi"] = result.inchi or ent["inchi"]
                                            ent["inchikey"] = result.inchikey or ent["inchikey"]
                                        ent["pch_iupac_name"] = result.iupac_name
                                    ent["pch_cids_by_name"] = "\"{}\"".format(",".join([str(c.cid) for c in results]))
                            except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError,
                                    TimeoutError, PubChemPyError):
                                pass

                        if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                            results = chemspider.search(ent["entity"] or ent["abbreviation"]) if chemspider_token else []
                            if results:
                                if len(results) == 1:
                                    found_in_chs = True
                                    result = results[0]
                                    if not found_in_pch:
                                        ent["smiles"] = result.smiles or ent["smiles"]
                                        ent["inchi"] = result.stdinchi or ent["inchi"]
                                        ent["inchikey"] = result.stdinchikey or ent["inchikey"]
                                    ent["chs_common_name"] = result.common_name
                                ent["chs_cids_by_name"] = "\"{}\"".format(",".join([str(c.csid) for c in results]))

                        for search_field, col_pch, col_chs in [("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"),
                                                               ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi"),
                                                               ("formula", "pch_cids_by_formula", "")]:
                            results_pch = []
                            results_chs = []

                            # SMILES containing "*" are not searched.
                            if search_field == "smiles" and "smiles" in ent and ent["smiles"] and "*" not in ent["smiles"]:
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["smiles"], "smiles")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError,
                                            ServerError, TimeoutError, PubChemPyError):
                                        pass
                                if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                    results_chs = chemspider.search(ent["smiles"]) if chemspider_token else []
                            elif search_field == "inchi" and "inchi" in ent and ent["inchi"]:
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["inchi"], "inchi")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError,
                                            ServerError, TimeoutError, PubChemPyError):
                                        pass
                                if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs):
                                    results_chs = chemspider.search(ent["inchi"]) if chemspider_token else []
                            elif search_field == "formula":
                                if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs):
                                    try:
                                        results_pch = get_compounds(ent["entity"], "formula")
                                    except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError,
                                            ServerError, TimeoutError, PubChemPyError):
                                        pass
                                # ChemSpider doesn't have search field for 'formula'

                            if results_pch:
                                ent[col_pch] = "\"{}\"".format(",".join([str(c.cid) for c in results_pch]))
                            if results_chs:
                                ent[col_chs] = "\"{}\"".format(",".join([str(c.csid) for c in results_chs]))
                            sleep(0.5)

                    sleep(annotation_sleep)

                    # A second pass only happens when a single-hit name search set one of the flags.
                    if not found_in_pch and not found_in_chs:
                        break

        if output_file:
            dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter,
                        write_header=write_header)

        return to_return
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before the
# tokens are read below.
load_dotenv()
DISCORD_TOKEN = os.getenv('DISCORD_TOKEN')
CHEMSPIDER_TOKEN = os.getenv('CHEMSPIDER_TOKEN')
WOLFRAM_TOKEN = os.getenv('WOLFRAM_TOKEN')

# API clients and the Discord bot (commands are prefixed with "!").
cs = ChemSpider(CHEMSPIDER_TOKEN)
wolfram = wolframalpha.Client(WOLFRAM_TOKEN)
client = commands.Bot(command_prefix='!')

# Headless Chrome; binary and driver locations come from the environment.
op = webdriver.ChromeOptions()
op.binary_location = os.getenv('GOOGLE_CHROME_BIN')
op.add_argument('--headless')
op.add_argument('--no-sandbox')
# The Chrome switch is spelled "--disable-dev-shm-usage"; the previous
# "--disable-dev-sh-usage" is not a Chrome switch, so it had no effect.
op.add_argument('--disable-dev-shm-usage')
# NOTE(review): ``executable_path`` / ``chrome_options`` are deprecated
# keyword names in newer Selenium releases — confirm the pinned version.
driver = webdriver.Chrome(executable_path=os.getenv('CHROMEDRIVER_PATH'),
                          chrome_options=op)

# for local testing purposes only; comment out when deployed to Heroku
#driver = webdriver.Firefox()
from chemspipy import ChemSpider

# Module-wide ChemSpider client shared by the helpers below.
cs = ChemSpider("CHEMSPIDER_API_KEY")


def remove_prefix(disctext):
    """Return ``disctext.content`` without the leading "!chem " prefix.

    Returns None when the content does not start with the prefix.
    """
    cprefix = "!chem "
    if not disctext.content.startswith(cprefix):
        return None
    prefix_length = len(cprefix)
    return disctext.content[prefix_length:]


def getCompound(id):
    """Return the top ChemSpider search hit for a compound name or id."""
    # search() yields a list of compounds; the first one is the top hit.
    return cs.search(id)[0]
# python script
# Uses the ChemSpider API package (chemspipy) to look up information about
# chemical compounds.

# import the ChemSpider API package
from chemspipy import ChemSpider
import csv  # for importing data
import os  # for working directory
import pandas as pd  # apparently this is better for making a working directory
import numpy as np  # this is something else for dataframes

# define the ChemSpider object with my security code
cs = ChemSpider('6c2e700b-6a92-4551-9cc1-70f28c021f23')

# example of how to get a compound info from the ChemSpider ID
#compound = cs.get_compound(2157)
#print(compound.smiles)

# working directory info
#cwd = os.getcwd()
os.chdir('/Users/mkamarck/Documents/chemspipy')  # change working directory

# figure out how to make a dataframe or matrix variable
# ``with`` closes the CSV file once every row has been printed; it was
# previously left open for the lifetime of the script.
with open('SymriseOdorList_forChemSpiPy.csv') as f:
    csv_f = csv.reader(f)
    for row in csv_f:
        # Single-argument form prints identically on Python 2 and 3.
        print(row)
        # 660002', 'ACETANISOLE CRYST.', '100-06-1', 'KETONE', 'S'

# I can get chem ID from the CAS number
#for row in csv_f:
#    print row[3]
import CoolProp
from chemspipy import ChemSpider
from chemspipy_key import key  # private file with the key (DO NOT COMMIT!!)
import glob, json

# ChemSpider client authenticated with the private key.
cs = ChemSpider(key)

# Map from name to Chemspider ID
backup_map = {
    'Propyne': 6095,
    'R236EA': 71342,
    'R245ca': 62827,
    'trans-2-Butene': 56442,
    'Oxygen': 952,
    'Fluorine': 22932,
    'Hydrogen': 762,
    'Deuterium': 22931,
    'HFE143m': 66577,
    'SulfurHexafluoride': 16425,
    'R114': 13853215
}

# Make sure the key works: fetch compound 2157 and compare its InChIKey with
# the expected value (fails at import time otherwise).
c = cs.get_compound(2157)
assert (c.inchikey == 'BSYNRYMUTXBXSQ-UHFFFAOYAW')

# Read the name of every fluid from the JSON files in ../fluids.
# NOTE(review): ``fluid`` is assigned but not used in the visible part; the
# loop body presumably continues beyond this excerpt — confirm. The nesting
# of the last statement was reconstructed from a whitespace-collapsed source.
for fname in glob.glob('../fluids/*.json'):
    with open(fname, 'r') as fp:
        jj = json.load(fp)
        fluid = jj['INFO']['NAME']
def handle(self, *args, **kwargs):
    """Import a legacy data dump from a directory of TSV files.

    ``kwargs['directory']`` names the folder holding the TSV files. The
    import runs in a fixed order because later steps look up rows created
    by earlier ones: users, lab groups and memberships, de-duplicated
    reaction references, performed reactions, reaction descriptor values,
    compounds, chemical classes, and finally compound quantities.

    Raises:
        RuntimeError: when a reference matches more than one reaction,
            a compound cannot be resolved unambiguously on ChemSpider,
            or a quantity row carries an unknown unit.
        ValidationError: re-raised when a compound fails its ChemSpider
            consistency check.
        PerformedReaction.DoesNotExist: re-raised when a quantity row
            references an unknown reaction.
    """
    folder = kwargs['directory']
    self._import_users(folder)
    self._import_lab_groups(folder)
    self._write_deduplicated_reactions(folder)
    self._import_reactions(folder)
    self._import_reaction_descriptors(folder)
    self._import_compounds(folder)
    self._import_chemical_classes(folder)
    self._import_compound_quantities(folder)


def _import_users(self, folder):
    """Create a User for every row of User.tsv whose username is new."""
    with open(path.join(folder, 'User.tsv')) as userFile:
        reader = csv.DictReader(userFile, delimiter='\t')
        for r in reader:
            if User.objects.filter(username=r['username']).exists():
                continue
            u = User(
                username=r['username'],
                first_name=r['first_name'],
                last_name=r['last_name'],
                email=r['email'],
                is_staff=int(r['is_staff']),
                is_superuser=int(r['is_superuser']),
            )
            # The stored password value is copied verbatim from the file.
            u.password = r['password']
            u.save()


def _import_lab_groups(self, folder):
    """Create missing lab groups, then attach users to their groups."""
    with open(path.join(folder, 'labGroup.tsv')) as labGroups:
        reader = csv.DictReader(labGroups, delimiter='\t')
        for r in reader:
            if not LabGroup.objects.filter(title=r['title']).exists():
                LabGroup(**r).save()
    with open(path.join(folder, 'labgroup_users.tsv')) as labGroupUsers:
        reader = csv.DictReader(labGroupUsers, delimiter='\t')
        for r in reader:
            group = LabGroup.objects.get(title=r['title'])
            group.users.add(User.objects.get(username=r['username']))


def _write_deduplicated_reactions(self, folder):
    """Write performedReactionsNoDups.tsv with unique lowercase references.

    Does nothing when the output file already exists.
    """
    out_path = path.join(folder, 'performedReactionsNoDups.tsv')
    if path.isfile(out_path):
        return
    self.stdout.write(
        'Writing file with duplicate references disambiguated (arbitrarily)')
    in_path = path.join(folder, 'performedReactions.tsv')
    with open(in_path) as in_file, open(out_path, 'w') as out_file:
        references = set()
        reader = csv.DictReader(in_file, delimiter='\t')
        writer = csv.DictWriter(out_file, delimiter='\t',
                                fieldnames=reader.fieldnames)
        writer.writeheader()
        case_count = 0
        valid_case_count = 0
        dup_count = 0
        for r in reader:
            ref = r['reference'].lower()
            if ref != r['reference']:
                self.stderr.write(
                    'Reference {} was not in lowercase. Converted.'.format(
                        r['reference']))
                case_count += 1
                if r['valid'] == '1':
                    valid_case_count += 1
            if ref in references:
                # Duplicates are kept but renamed and marked invalid.
                r['notes'] += ' Duplicated reference'
                r['valid'] = 0
                dup_count += 1
                i = 1
                new_ref = ref
                while new_ref in references:
                    new_ref = '{}_dup{}'.format(ref, i)
                    i += 1
                self.stderr.write(
                    'Reference {} duplicated {} times. Renamed and invalidated'
                    .format(ref, i))
                ref = new_ref
            references.add(ref)
            r['reference'] = ref
            writer.writerow(r)
        self.stderr.write(
            '{} references converted to lowercase. {} were valid'.format(
                case_count, valid_case_count))
        self.stderr.write(
            '{} references with _dupX appended to remove duplicate reference'
            .format(dup_count))


def _import_reactions(self, folder):
    """Create a PerformedReaction for every reference not yet stored."""
    with open(path.join(folder, 'performedReactionsNoDups.tsv')) as reactions:
        reader = csv.DictReader(reactions, delimiter='\t')
        for r in reader:
            if PerformedReaction.objects.filter(
                    reference=r['reference'].lower()).exists():
                continue
            p = PerformedReaction(
                reference=r['reference'],
                labGroup=LabGroup.objects.get(title=r['labGroup.title']),
                notes=r['notes'],
                user=User.objects.get(username=r['user.username']),
                valid=int(r['valid']),
                legacyRecommendedFlag=r['legacyRecommendedFlag'] == 'Yes',
                insertedDateTime=r['insertedDateTime'],
                public=int(r['public']))
            self.stdout.write(
                'Creating reaction with reference {}'.format(p.reference))
            p.validate_unique()
            p.save(calcDescriptors=False)


def _sync_descriptor_value(self, model, descriptor, reaction, value, pending):
    """Update the stored value for a descriptor/reaction pair, or queue a new one.

    New values are appended to ``pending`` (unsaved) so the caller can
    insert them in bulk.
    """
    try:
        existing = model.objects.get(descriptor=descriptor, reaction=reaction)
        if existing.value != value:
            existing.value = value
            existing.save()
    except model.DoesNotExist:
        pending.append(descriptor.createValue(reaction, value))


def _flush_descriptor_values(self, batches):
    """Bulk-insert every queued descriptor value, then empty the queues."""
    self.stdout.write("Saving...")
    for model, pending in batches:
        model.objects.bulk_create(pending)
    for _, pending in batches:
        del pending[:]
    self.stdout.write("...saved")


def _import_reaction_descriptors(self, folder):
    """Link duplicate reactions and store per-reaction descriptor values."""
    with open(path.join(folder, 'performedReactionsNoDups.tsv')) as reactions:
        reader = csv.DictReader(reactions, delimiter='\t')
        outValues = []
        outBoolValues = []
        purityValues = []
        temperatureValues = []
        timeValues = []
        pHValues = []
        preHeatStandingValues = []
        teflonValues = []
        # (model, queue) pairs in the order they are bulk-inserted.
        batches = [
            (OrdRxnDescriptorValue, outValues),
            (BoolRxnDescriptorValue, outBoolValues),
            (OrdRxnDescriptorValue, purityValues),
            (NumRxnDescriptorValue, temperatureValues),
            (NumRxnDescriptorValue, timeValues),
            (NumRxnDescriptorValue, pHValues),
            (NumRxnDescriptorValue, preHeatStandingValues),
            (BoolRxnDescriptorValue, teflonValues),
        ]
        for r in reader:
            reference = r['reference'].lower()
            self.stdout.write(
                'Reiterating for reaction with reference {}'.format(reference))
            ps = PerformedReaction.objects.filter(reference=reference)
            if ps.count() > 1:
                ps = ps.filter(valid=True)
            if not ps.exists():
                continue
            if ps.count() > 1:
                raise RuntimeError(
                    '{} has more than one reaction'.format(reference))
            p = ps[0]
            try:
                p.duplicateOf = PerformedReaction.objects.get(
                    reference=r['duplicateOf.reference'].lower())
                p.save()
            except PerformedReaction.DoesNotExist:
                # No such original reaction: leave duplicateOf untouched.
                pass

            # Outcome is only accepted when it is one of '1'..'4'.
            outcomeValue = (int(r['outcome'])
                            if r['outcome'] in ('1', '2', '3', '4') else None)
            self._sync_descriptor_value(
                OrdRxnDescriptorValue, outcomeDescriptor, p, outcomeValue,
                outValues)

            # A missing outcome counts as False; the explicit None guard
            # avoids a None-vs-int comparison, which fails on Python 3.
            value = outcomeValue is not None and outcomeValue > 2
            self._sync_descriptor_value(
                BoolRxnDescriptorValue, outcomeBooleanDescriptor, p, value,
                outBoolValues)

            value = int(r['purity']) if r['purity'] in ('1', '2') else None
            self._sync_descriptor_value(
                OrdRxnDescriptorValue, purityDescriptor, p, value,
                purityValues)

            # Offset by 273.15 (Celsius to Kelvin).
            value = ((float(r['temp']) + 273.15)
                     if r['temp'] not in ('', '?') else None)
            self._sync_descriptor_value(
                NumRxnDescriptorValue, temperatureDescriptor, p, value,
                temperatureValues)

            # Scaled by 60 -- presumably hours to minutes; confirm units.
            value = (float(r['time']) * 60
                     if r['time'] not in ['', '?'] else None)
            self._sync_descriptor_value(
                NumRxnDescriptorValue, timeDescriptor, p, value, timeValues)

            value = float(r['pH']) if r['pH'] not in ('', '?') else None
            self._sync_descriptor_value(
                NumRxnDescriptorValue, pHDescriptor, p, value, pHValues)

            # NOTE(review): bool() of any non-empty string is True, and the
            # value is stored through the numeric model -- confirm intent.
            value = (bool(r['pre_heat standing'])
                     if r.get('pre_heat standing') not in ('', None) else None)
            self._sync_descriptor_value(
                NumRxnDescriptorValue, preHeatStandingDescriptor, p, value,
                preHeatStandingValues)

            value = (bool(int(r['teflon_pouch']))
                     if r.get('teflon_pouch') not in (None, '') else None)
            self._sync_descriptor_value(
                BoolRxnDescriptorValue, teflonDescriptor, p, value,
                teflonValues)

            if len(outValues) > 500:
                self._flush_descriptor_values(batches)

        # Persist whatever was queued after the last in-loop flush;
        # without this the final partial batch would never be saved.
        if any(pending for _, pending in batches):
            self._flush_descriptor_values(batches)


def _import_compounds(self, folder):
    """Create compounds from compound_labs.tsv, resolving them on ChemSpider."""
    with open(path.join(folder, 'compound_labs.tsv')) as compounds:
        reader = csv.DictReader(compounds, delimiter='\t')
        cs = ChemSpider(settings.CHEMSPIDER_TOKEN)
        for r in reader:
            lab_group = LabGroup.objects.get(title=r['labGroup.title'])
            if Compound.objects.filter(abbrev=r['abbrev']).exists():
                continue
            self.stdout.write(
                'Importing compound with abbreviation {} and name {}'.format(
                    r['abbrev'], r['name']))
            if r.get('custom') != '1':
                try:
                    if r.get('CSID') not in ('', None):
                        csid = r['CSID']
                    else:
                        # Try the CAS number first, then fall back to the
                        # name; exactly one hit is required either way.
                        if r.get('CAS_ID') not in (None, ''):
                            CASResults = cs.simple_search(r['CAS_ID'])
                        else:
                            CASResults = []
                        if len(CASResults) != 1:
                            nameResults = cs.simple_search(r.get('name'))
                            if len(nameResults) != 1:
                                raise RuntimeError(
                                    'Could not get unambiguous chemspider entry for CAS ID {} with name{}'
                                    .format(r['CAS_ID'], r['name']))
                            csid = nameResults[0].csid
                        else:
                            csid = CASResults[0].csid
                    c = Compound(CSID=csid, labGroup=lab_group,
                                 abbrev=r['abbrev'])
                    c.csConsistencyCheck()
                    c.save()
                except ValidationError as e:
                    # NOTE(review): c may still be unsaved here when the
                    # consistency check itself raised -- confirm delete()
                    # is safe on an unsaved instance.
                    c.delete()
                    raise e
            else:
                if r.get('INCHI') is None:
                    r['INCHI'] = ''
                if r.get('smiles') is None:
                    r['smiles'] = ''
                c = Compound.objects.get_or_create(
                    labGroup=lab_group,
                    custom=True,
                    name=r['name'],
                    abbrev=r['abbrev'],
                    formula=r['formula'],
                    smiles=r['smiles'],
                    INCHI=r['INCHI'])[0]
            # NOTE(review): original indentation was lost; these two lines
            # are assumed to run for both custom and ChemSpider compounds.
            self.stdout.write(c.name.encode('utf-8'))
            c.save()


def _import_chemical_classes(self, folder):
    """Attach chemical classes to every compound sharing an abbreviation."""
    with open(path.join(
            folder, 'compound_chemicalClasses.tsv')) as chemicalClasses:
        reader = csv.DictReader(chemicalClasses, delimiter='\t')
        for r in reader:
            self.stdout.write('working with class {}'.format(
                r['chemicalClass.label']))
            matching = Compound.objects.filter(abbrev=r['compound.abbrev'])
            if matching.count() > 0:
                chem_class = ChemicalClass.objects.get_or_create(
                    label=r['chemicalClass.label'])[0]
                for compound in matching:
                    if chem_class not in compound.chemicalClasses.all():
                        compound.chemicalClasses.add(chem_class)
                        compound.save()


def _import_compound_quantities(self, folder):
    """Store the amount of each compound used in each reaction."""
    with open(path.join(folder, 'compoundquantities.tsv')) as cqs:
        reader = csv.DictReader(cqs, delimiter='\t')
        for r in reader:
            try:
                reaction = PerformedReaction.objects.get(
                    reference=r['reaction.reference'].lower())
                compound = Compound.objects.get(
                    abbrev=r['compound.abbrev'], labGroup=reaction.labGroup)
                if r['compound.abbrev'] in ('water', 'H2O'):
                    r['density'] = 1
                mw = NumMolDescriptorValue.objects.get(
                    compound=compound, descriptor__heading='mw').value
                if r['compoundrole.name'] != 'pH':
                    self.stdout.write('adding {} to {}'.format(
                        compound.abbrev, reaction.reference))
                    compoundrole = CompoundRole.objects.get_or_create(
                        label=r['compoundrole.name'])[0]
                    # Convert the recorded amount by dividing by mw.
                    if r['amount'] in ('', '?'):
                        amount = None
                    elif r['unit'] == 'g':
                        amount = float(r['amount']) / mw
                    elif r['unit'] == 'd':
                        # 0.0375 is presumably the volume of one drop in
                        # mL -- confirm.
                        amount = (float(r['amount']) * 0.0375 *
                                  float(r['density']) / mw)
                    elif r['unit'] == 'mL':
                        amount = float(r['amount']) * float(r['density']) / mw
                    else:
                        raise RuntimeError('invalid unit entered')
                    if amount is not None:
                        # presumably mol to mmol -- confirm.
                        amount = amount * 1000
                    cqq = CompoundQuantity.objects.filter(
                        role=compoundrole, compound=compound,
                        reaction=reaction)
                    if cqq.count() > 1:
                        # Collapse duplicate rows before (re)creating one.
                        cqq.delete()
                    quantity = CompoundQuantity.objects.get_or_create(
                        role=compoundrole, compound=compound,
                        reaction=reaction)[0]
                    quantity.amount = amount
                    quantity.save()
                else:
                    # pH-adjusting reagents are recorded as a note only.
                    reaction.notes += ' pH adjusting reagent used: {}, {}{}'.format(
                        r['compound.abbrev'], r['amount'], r['unit'])
                    reaction.save(calcDescriptors=False)
            except Compound.DoesNotExist as e:
                self.stderr.write(
                    'Unknown Reactant {} with amount {} {} in reaction {}'.
                    format(r['compound.abbrev'], r['amount'], r['unit'],
                           r['reaction.reference']))
                # Python 2 builtin: pause until the operator acknowledges.
                raw_input("Continue?")
                reaction.notes += ' Unknown Reactant {} with amount {} {}'.format(
                    r['compound.abbrev'], r['amount'], r['unit'])
                reaction.valid = False
                reaction.save(calcDescriptors=False)
            except PerformedReaction.DoesNotExist as e:
                raise e
# Base Python libraries
import sys, os, json
from types import *
from distutils.dir_util import mkpath

# NIH resolver interface
import cirpy

# PubChem interface
import pubchempy as pcp

# ChemSpider
CST = os.environ['CHEMSPIDER_SECURITY_TOKEN']
from chemspipy import ChemSpider
cs = ChemSpider(security_token=CST)


def getJSON(inchikey):
    """Load the cached JSON record for an InChIKey and fill in missing keys.

    The cache file lives at ``json/<first two characters>/<inchikey>.json``.
    When the file cannot be read or does not contain valid JSON, a fresh
    record holding only the ``inchikey`` is started instead.
    """
    items = {}
    filename = 'json/%s/%s.json' % (inchikey[0:2], inchikey)
    try:
        with open(filename) as fp:
            items = json.load(fp)
    except (IOError, ValueError):
        # A tuple is required here: the old ``except IOError, ValueError``
        # form only caught IOError and bound it to the name ValueError.
        items["inchikey"] = inchikey

    # check if we need to get various keys
    # PubChem CID
    if "pubchem_cid" not in items:
        results = pcp.get_compounds(inchikey, 'inchikey')
        results.sort()
        pcpCmpd = results[0]
from django.utils.text import slugify
from chemspipy import ChemSpider

from professor_oak.models import ScoreMixin

log = logging.getLogger(__name__)

# Load the chemspider API for accessing the RSC structure database.
# When no key is configured the API object is None.
try:
    cs_key = settings.CHEMSPIDER_KEY
except AttributeError:
    # Logger.warn is a deprecated alias; warning() is the supported name.
    log.warning('CHEMSPIDER_KEY not found in localsettings.py')
    chemspider_api = None
else:
    chemspider_api = ChemSpider(cs_key)


class Hazard(models.Model):
    """A hazard type as defined by the global harmonized system.

    Attributes
    ----------
    - pictogram : Image file that represents this hazard. If not
      provided, we will look in `static_files/ghs_pictograms/` for one
      that matches the `name` attribute
    """
    # Short codes for the hazard categories
    # (presumably used as field choices -- confirm against the fields).
    PHYSICAL = 'p'
    HEALTH = 'h'
    PHYSICAL_AND_HEALTH = 'ph'
    ENVIRONMENTAL = 'e'
# Tool uses the ChemSpiPy library to assist in accessing the ChemSpider Database
# Syntax to run command: python ChemSpider.py -(f/n) term
#   -f name    -> print the common name and formula of up to five matches
#   -n formula -> print the common name of the first match
import sys  # allows us to use command line arguments

if len(sys.argv) < 3:
    print("Incorrect input.\n\t==> python ChemSpider.py [-f/-n] <argument>")
    # Exit with a non-zero status so callers can detect the usage error.
    sys.exit(1)

# Imported after the argument check so a usage error is reported without
# needing the library.
from chemspipy import ChemSpider

# Passes our access token to the ChemSpider API.
# NOTE(review): the token is hard-coded in source; consider loading it from
# the environment instead.
cs = ChemSpider('3e05e0a6-9f49-4dff-ba0e-a9d6ca3d04ea')

for result in cs.search(sys.argv[2])[:5]:  # Give the first five results for -f.
    if sys.argv[1] == "-f":
        print(result.common_name)
        print(result.molecular_formula)
    if sys.argv[1] == "-n":
        # Only the top match is reported for -n.
        print(result.common_name)
        break