def __init__(self):
    """
    Set up the object's data containers, text translator and ChemSpider client.

    The three data containers (reactions dataframe, species dataframe and
    unique-species dictionary) are created empty (``None``); nothing is
    loaded here.
    """
    # Data containers: reactions, reactants (species) and the dictionary of
    # unique reactants. All of them start out unset.
    self.reactions_dataframe = self.species_df = self.unique_species_dict = None

    # Translation table used to clean unfamiliar characters out of
    # individual reactant names. Argument order of str.maketrans:
    # (characters to replace, their replacements, characters mapped to None).
    self.translator = str.maketrans("Î", "α", "±€™")

    # Authenticate against the ChemSpider API with the stored token
    # (Maneet's token).
    self.security_token = "99c9f388-12be-4b22-8f83-00b6f1e2d7d0"
    client_description = "StudentResearcher, ChemSpiPy 1.0.5, Python 3.6"
    self.cs = ChemSpider(self.security_token, user_agent=client_description)

    print('--Populator Initialized--')
def find_matches(matched_in_ChemSpider, massFile_Name):
    """
    Confirm mass-based ChemSpider candidates by comparing mol files.

    :param matched_in_ChemSpider: mapping of product identifier -> iterable of
        candidate ChemSpider IDs (compounds with almost the same mass)
    :param massFile_Name: passed through to ``read_product_molFile`` together
        with the product identifier to load the product's mol file
    :return: dict mapping each product identifier to the list of CSIDs whose
        2D mol file was judged identical by ``compare_two_molFiles``; products
        without any confirmed match are omitted
    """
    from chemspipy import ChemSpider
    cs = ChemSpider('dfdc677d-e7d3-435b-a74e-bfe6167a3899')

    # The result dict must be created once, before the loop: re-creating it
    # per product would discard every product except the last one (and leave
    # it undefined for empty input).
    matches = {}

    for i, candidate_csids in matched_in_ChemSpider.items():
        print(i)

        # load mol file info of the product
        product_molFile = read_product_molFile(massFile_Name, i)

        # keep every candidate whose mol file is the same as the product's
        matched_compounds = []
        for CSID in candidate_csids:
            c = cs.get_compound(CSID)
            if compare_two_molFiles(product_molFile, c.mol_2d):
                matched_compounds.append(CSID)

        # only record products with at least one confirmed match
        if matched_compounds:
            matches[i] = matched_compounds

    # the whole set of matches, for all products
    return matches
def get_image_url(self):
    """
    Return a ChemSpider image URL for this document, or None.

    The decoded metadata decides the path taken:
    - a 'csid' entry: the URL is built directly from ``self.csid``;
    - an 'InChIKey' / 'inchikey' entry: ChemSpider is searched with that key,
      the first hit's csid is written back into the metadata (and the object
      saved), and the hit's image URL is returned;
    - neither, or no search hits: None.
    """
    md = jsonpickle.decode(self.metadata)

    # Already annotated with a csid: just build the URL.
    if 'csid' in md:
        return 'http://www.chemspider.com/ImagesHandler.ashx?id=' + str(
            self.csid)

    # Neither a csid nor an InChIKey: no image.
    if 'InChIKey' not in md and 'inchikey' not in md:
        return None

    # Look the csid up from the InChIKey.
    # this code doesn't work...due to an upgrade in chemspider
    # if you want images, get the mol
    from chemspipy import ChemSpider
    cs = ChemSpider(settings.CHEMSPIDER_APIKEY)
    ikey = md.get('InChIKey', md.get('inchikey'))
    results = cs.search(ikey)
    if not results:
        return None

    # Remember the csid for next time, then hand back the image URL.
    first_hit = results[0]
    md['csid'] = first_hit.csid
    self.metadata = jsonpickle.encode(md)
    self.save()
    return first_hit.image_url
def csConsistencyCheck(self):
    """Perform a consistency check of this record against ChemSpider.

    Records flagged as ``custom`` are not checked at all. For any other
    record the compound is fetched by ``self.CSID`` and compared with the
    stored fields:

    - name: if set, the fetched compound must be among the results of a
      ChemSpider search for that name; if unset, it is filled with the
      compound's common name.
    - INCHI, smiles, formula: an empty string is filled in from ChemSpider;
      any other value must equal the ChemSpider value.

    Raises:
        ValidationError: immediately (code ``no_csid``) when no CSID is set,
            or at the end with the list of every mismatch that was found.
    """
    if self.custom:
        return

    # Equality, not identity: ``is ''`` depends on string interning and is
    # not a reliable emptiness test.
    if self.CSID is None or self.CSID == '':
        raise ValidationError('No CSID set', 'no_csid')

    errorList = []
    cs = ChemSpider(settings.CHEMSPIDER_TOKEN)
    csCompound = cs.get_compound(self.CSID)

    if self.name not in ('', None):
        nameResults = cs.simple_search(self.name)
        if csCompound not in nameResults:
            # NOTE(review): the code 'invalid_inchi' looks copy-pasted for a
            # *name* mismatch; kept unchanged in case callers match on it.
            errorList.append(ValidationError(
                'A compound was consistency checked and was found to have an invalid name',
                code='invalid_inchi'))
    else:
        self.name = csCompound.common_name

    if self.INCHI == '':
        self.INCHI = csCompound.stdinchi
    elif self.INCHI != csCompound.stdinchi:
        errorList.append(ValidationError(
            'A compound was consistency checked and was found to have an invalid InChi',
            code='invalid_inchi'))

    if self.smiles == '':
        self.smiles = csCompound.smiles
    elif self.smiles != csCompound.smiles:
        errorList.append(ValidationError(
            'A compound was consistency checked and was found to have an invalid smiles string',
            code='invalid_smiles'))

    if self.formula == '':
        self.formula = csCompound.molecular_formula
    elif self.formula != csCompound.molecular_formula:
        # Previously appended to the misspelled ``errorsList`` (NameError).
        errorList.append(ValidationError(
            'A compound was consistency checked and was found to have an invalid formula',
            code="invalid_formula"))

    if errorList:
        raise ValidationError(errorList)
def clean_name(self):
    """
    Validate the submitted name against ChemSpider.

    The name is searched on ChemSpider; it is accepted (and returned
    unchanged) only when one of the hits carries the CSID of the form's
    instance. Otherwise a ValidationError is raised.
    """
    submitted_name = self.cleaned_data['name']
    client = ChemSpider(settings.CHEMSPIDER_TOKEN)
    hit_csids = [hit.csid for hit in client.simple_search(submitted_name)]

    if self.instance.CSID in hit_csids:
        return self.cleaned_data['name']
    raise ValidationError("That name is not a known synonym for this compound")
def set_and_initialize_token(self, input_token):
    """
    Store a ChemSpider security token and build the API client from it.

    :param input_token: your security token (for ChemSpider)
    :return: None
    """
    # Keep the token on the object ...
    self.security_token = input_token
    # ... and (re)create the ChemSpider client with the stored value.
    client = ChemSpider(self.security_token)
    self.cs = client
def __init__(self, user, *args, **kwargs):
    """
    Build the form with the 'labGroups' choices limited to the user's groups.

    Remaining positional and keyword arguments are forwarded to the parent
    form. When the user belongs to at least one lab group, the field's empty
    choice is removed.
    """
    super(CompoundForm, self).__init__(*args, **kwargs)
    self.compound = None
    self.chemSpider = ChemSpider(settings.CHEMSPIDER_TOKEN)

    lab_groups_field = self.fields['labGroups']
    lab_groups_field.queryset = user.labgroup_set.all()
    user_has_groups = user.labgroup_set.all().exists()
    if user_has_groups:
        lab_groups_field.empty_label = None
def clean_name(self):
    """
    Check the name is a valid synonym.

    The submitted name is searched on ChemSpider and accepted only if one of
    the hits has the same CSID as the form's instance; otherwise a
    ValidationError is raised. The accepted name is returned unchanged.
    """
    submitted_name = self.cleaned_data['name']
    client = ChemSpider(settings.CHEMSPIDER_TOKEN)
    hit_csids = [hit.csid for hit in client.simple_search(submitted_name)]

    if self.instance.CSID in hit_csids:
        return self.cleaned_data['name']
    raise ValidationError(
        "That name is not a known synonym for this compound")
def search_by_mass(mass, margine):
    """
    Run ChemSpider's simple mass search and return the CSIDs of the hits.

    :param mass: forwarded as the first argument of ``simple_search_by_mass``
    :param margine: forwarded as the second argument of ``simple_search_by_mass``
    :return: list with the ``csid`` of every result, in result order
    """
    # requires: pip install chemspipy
    from chemspipy import ChemSpider

    # a (registered) security code is needed to talk to the service
    cs = ChemSpider('dfdc677d-e7d3-435b-a74e-bfe6167a3899')

    # search the database and keep only the identifiers
    return [hit.csid for hit in cs.simple_search_by_mass(mass, margine)]
def getLongNames(molsDict, pref=4, onlyLettersDigits=False, token="2228d430-a955-416b-b920-14547d28df9e"):
    """
    Map every key of ``molsDict`` to the ChemSpider common name of its compound.

    The ChemSpider ID is taken from the key with its first ``pref`` characters
    removed. The common name is ASCII-encoded (non-ASCII characters dropped)
    and, when ``onlyLettersDigits`` is set, passed through
    ``leftOnlyLettersDigits``.

    Returns a dict ``{key: name}``.
    """
    client = ChemSpider(token)

    def _long_name(key):
        # Strip the prefix to get the ChemSpider ID, then fetch the name.
        compound = client.get_compound(key[pref:])
        ascii_name = compound.common_name.encode("ascii", "ignore")
        return leftOnlyLettersDigits(ascii_name) if onlyLettersDigits else ascii_name

    return {key: _long_name(key) for key in molsDict}
class ChemSp(object):
    """Small wrapper around a ChemSpider client configured from SettingsConstants."""

    def __init__(self):
        """Read the API key and URL from the settings and create the client."""
        config = SettingsConstants()
        self.key = config.get('CHEMSPI_KEY')
        self.url = config.get('CHEMSPI_API_URL')
        self.cs = ChemSpider(self.key, api_url=self.url)

    def get_cmpd(self, csid):
        """Return the ChemSpider compound for ``csid``."""
        return self.cs.get_compound(csid)

    def search(self, query):
        """
        Search ChemSpider for ``query`` and describe at most the first six hits.

        Each hit becomes a dict with the keys 'csid', 'name', 'iupac_name',
        'cas', 'inchi', 'formula' and 'image'. CAS number and IUPAC name are
        resolved from the InChI via cirpy; when several CAS numbers come back,
        the one closest to the query text is used. Progress is printed.
        """
        print('Connected to ChemSpider API')
        print("Searching started")
        print("Searching for: " + query)

        results = []
        for position, hit in enumerate(self.cs.search(query)):
            # Only positions 0..5 are processed.
            if position > 5:
                break
            print("Compound " + str(position))

            formula = str(hit.molecular_formula)
            csid = str(hit.csid)
            inchi = hit.inchi
            name = hit.common_name
            cas = cirpy.resolve(inchi, 'cas')
            iupac_name = cirpy.resolve(inchi, 'iupac_name')

            if type(cas) is list:
                # Several CAS numbers: rank them by similarity to the query
                # (cutoff 0 keeps every candidate) and take the best one.
                ranked_cas = difflib.get_close_matches(str(query), cas, 3, 0)
                print(ranked_cas)
                chosen_cas = ranked_cas[0]
            else:
                chosen_cas = cas

            image = hit.image_url
            print(image)

            results.append({
                'csid': csid,
                'name': name,
                'iupac_name': iupac_name,
                'cas': chosen_cas,
                'inchi': inchi,
                'formula': formula,
                'image': image,
            })

        print("Searching finished")
        print(results)
        return results

    def render_image(self, csid, image_id):
        """
        Write the compound's image to the temp folder and return its static URL.

        The returned path carries a ``timestamp`` query parameter built from
        the current time.
        """
        png_bytes = self.get_cmpd(csid).image
        target_file = '/home/marcin/Dokumenty/projekty/production/Chem/chembase/static/chembase/temp/temp' + image_id + '.png'
        with open(target_file, 'wb+') as out:
            out.write(png_bytes)
        return '/static/chembase/temp/temp' + image_id + '.png?timestamp=' + str(
            datetime.datetime.now())
def fromCsv(self, fileName, labGroup=None):
    """Read a CSV into compound objects, returning a list of compounds which have not yet been saved.

    This assumes that the uploaded csv will have headers which map to the
    names of the fields and that compound classes are stored as comma
    separated lists of the chemicalClass LABEL only.

    Each compound will perform a chemspider-based consistency check on the
    information it has been created with to ensure information is
    consistent - this throws a ValidationError if it is not.

    Args:
        fileName: path of the CSV file to read.
        labGroup: lab group assigned to every created compound. When omitted
            and this manager has an ``instance`` attribute, that instance is
            used.

    Returns:
        The list of newly created (unsaved) compounds.

    Raises:
        ValidationError: with the list of all per-row problems, if any row
            produced an error.
    """
    if labGroup is None and hasattr(self, 'instance'):
        # we presume that if this is being called without a labgroup that's
        # because this manager belongs to a lab group
        labGroup = self.instance

    compoundsList = []
    errors = []
    cs = ChemSpider(settings.CHEMSPIDER_TOKEN)

    with open(fileName) as f:
        reader = csv.DictReader(f, restkey='restKey')
        rowCount = 0
        for row in reader:
            try:
                rowCount += 1

                # Start from an empty list for EVERY row: without this a CSV
                # lacking the 'chemicalClasses' column raised NameError, and
                # classes could leak from one row into the next.
                chemicalClasses = []
                if 'chemicalClasses' in row:
                    for label in (c.strip() for c in row['chemicalClasses'].split(',')):
                        chemicalClass, created = ChemicalClass.objects.get_or_create(label=label)
                        chemicalClasses.append(chemicalClass)

                has_cas = row.get('CAS') not in ('', None)
                has_csid = row.get('CSID') not in ('', None)
                if has_cas and not has_csid:
                    # Resolve the CSID from the CAS number; it must be unique.
                    CASResults = cs.simple_search(row['CAS'])
                    if len(CASResults) < 1:
                        errors.append(ValidationError(
                            'CAS Number returned no results from ChemSpider on row %(rowCount)d of uploaded csv.',
                            params={'rowCount': rowCount}))
                    elif len(CASResults) == 1:
                        # a little hacky, but it gets the job done
                        row['CSID'] = CASResults[0].csid
                    else:
                        errors.append(ValidationError(
                            'CAS number returns more than one ChemSpider ID on row %(rowCount)d of uploaded csv.',
                            params={'rowCount': rowCount}))
                elif not has_csid:
                    errors.append(ValidationError(
                        'No CSID provided on row %(rowCount)d of uploaded csv.',
                        params={'rowCount': rowCount}))

                kwargs = {
                    'CSID': row.get('CSID'),
                    'abbrev': row.get('abbrev'),
                    'smiles': row.get('smiles'),
                    'name': row.get('name'),
                    'INCHI': row.get('INCHI'),
                }
                compound = Compound(labGroup=labGroup, **kwargs)
                for chemicalClass in chemicalClasses:
                    compound.lazyChemicalClasses.append(chemicalClass)
                compoundsList.append(compound)
            except ValidationError as e:
                # Re-label every message with the row it came from.
                for message in e.messages:
                    errors.append(ValidationError(
                        message + ' on row %(rowCount)d of uploaded csv',
                        params={'rowCount': rowCount}))

    if errors:
        raise ValidationError(errors)
    return compoundsList
def find_common_name(inchikey, formula):
    """
    Return the ChemSpider common name for ``inchikey``, else ``formula``.

    The lookup is only attempted when the module-level ``chemspikey`` is set
    and the InChIKey is non-empty; the common name is used only when the
    search gives exactly one result.
    """
    if not chemspikey:
        return formula

    cs = ChemSpider(chemspikey)
    if not len(inchikey) > 0:
        return formula

    hits = cs.search(inchikey)
    # Anything other than a single hit is treated as "not found".
    return hits[0].common_name if len(hits) == 1 else formula
def find_common_name(inchikey):
    """
    Return the ChemSpider common name for ``inchikey``, or None.

    The lookup is only attempted when the module-level ``chemspikey`` is set
    and the InChIKey is non-empty; the common name is used only when the
    search gives exactly one result.
    """
    if not chemspikey:
        return None

    cs = ChemSpider(chemspikey)
    if not len(inchikey) > 0:
        return None

    hits = cs.search(inchikey)
    # Anything other than a single hit is treated as "not found".
    return hits[0].common_name if len(hits) == 1 else None
def structure_url(self):
    """
    Return an image URL for this object's structure.

    Without a ``CHEMSPIDER_KEY`` setting a fixed placeholder image URL is
    returned. Otherwise ChemSpider is searched by ``self.name`` and the image
    URL of the first hit is returned, or an empty string when there are no
    hits.
    """
    from chemspipy import ChemSpider

    try:
        cs_key = settings.CHEMSPIDER_KEY
    except AttributeError:
        # No API key configured: fall back to the placeholder picture.
        return 'http://discovermagazine.com/~/media/Images/Zen%20Photo/N/nanoputian/3487.gif'

    hits = ChemSpider(cs_key).simple_search(self.name)
    try:
        return hits[0].image_url
    except IndexError:
        return ""
def database_setup():
    """
    Download 2D & 3D molecule structure from ChemSpider server to create a database

    For every ID returned by ``get_id()`` two files are written into the
    ``DATABASE`` directory: ``<id>_2d.txt`` and ``<id>_3d.txt`` holding the
    compound's ``mol_2d`` / ``mol_3d`` data. If the directory already exists
    the program exits without downloading anything.
    """
    import sys
    from chemspipy import ChemSpider

    # compile id list for calling molecules
    id_list = get_id()
    directory = DATABASE

    # make directory database_chemspider/ if needed
    if os.path.isdir(directory):
        print('Database folder already existed! Aborting... \n '
              'Please remove the folder and rerun')
        # sys.exit() instead of the interactive-only exit() helper
        sys.exit()
    os.mkdir(directory)

    print('downloading..')
    os.chdir(directory)  # change dir to database_chemspider/
    try:
        # access API key
        # NOTE(review): 'text' looks like a placeholder key - confirm.
        cs = ChemSpider('text')

        # go through each id
        for id_chemspider in id_list:
            path_2d = str(id_chemspider) + '_2d.txt'
            path_3d = str(id_chemspider) + '_3d.txt'

            # skip ids that were already downloaded (e.g. duplicates in the list)
            if os.path.exists(path_2d):
                print('ID ' + str(id_chemspider) + ' already existed')
                continue

            # access molecule data
            c = cs.get_compound(id_chemspider)

            # write 2d coord and bond data; ``with`` closes the file even if
            # the write fails
            with open(path_2d, 'w') as f:
                f.write(c.mol_2d)

            # write 3d coord and bond data
            with open(path_3d, 'w') as f:
                f.write(c.mol_3d)
    finally:
        # leave the database directory again, also when a download failed
        os.chdir('../')
def __init__(self, user, *args, **kwargs):
    """
    Build the form with the 'labGroup' choices limited to the user's groups.

    Remaining positional and keyword arguments are forwarded to the parent
    form. When the user belongs to at least one lab group, the field's empty
    choice is removed.
    """
    super(CompoundForm, self).__init__(*args, **kwargs)
    self.compound = None
    self.chemSpider = ChemSpider(settings.CHEMSPIDER_TOKEN)

    lab_group_field = self.fields['labGroup']
    lab_group_field.queryset = user.labgroup_set.all()
    user_has_groups = user.labgroup_set.all().exists()
    if user_has_groups:
        lab_group_field.empty_label = None
def index(req):
    """
    Serve the interpreter page (non-POST) or convert a chemistry request to LaTeX (POST).

    The POST body is JSON and must contain "chemType" and "value":
    - chemType "compound": ChemSpider is searched and the formulas of at most
      the first four hits are converted with ``formulaToLatex``; the result
      is returned as ``JSONResponse({"latex": ...})``.
    - chemType "element": "name" plus the first four entries of "value" are
      passed to ``textToLatex`` and returned as plain text.
    - anything else: an empty HTTP 400 response.
    """
    if req.method != "POST":
        return render(req, "chemInterpreter/index.html")

    print("POST METHOD")
    body = json.loads(req.body)
    print(body)
    print(body["chemType"])
    chemType = body["chemType"]
    values = body["value"]
    print(chemType == "compound")

    if chemType == "compound":
        print("in this function")
        CS = ChemSpider(security_token)
        # NOTE(review): the whole request body (a dict) is used as the search
        # query, and hits are read via ``molecularFormula`` - confirm both
        # against the ChemSpiPy API in use.
        current_chem_symbol = CS.search(body)
        print(current_chem_symbol)
        print(current_chem_symbol[0].common_name)
        returned_responses = [formulaToLatex(c.molecularFormula) for c in current_chem_symbol]
        # keep at most four suggestions
        returned_responses = returned_responses[:4]
        # was ``returned_resposnes`` (NameError on every compound request)
        return JSONResponse({"latex": str(returned_responses)})

    if chemType == "element":
        name = body["name"]
        print("arrived at function")
        for i in range(4):
            print(values[i])
        print(name)
        latex = textToLatex(name, values[0], values[1], values[2], values[3])
        print(latex)
        return HttpResponse(latex, content_type="text/plain")

    # Unknown chemType: a view must always return a response object, so
    # answer with "400 Bad Request" instead of returning None.
    return HttpResponse(status=400)
def get_chemspider_structure(csid):
    """
    Fetch a compound from ChemSpider and write its structure as a PDB file.

    The compound's ``mol_3d`` data is read in 'mol' format, hydrogens are
    added, and the result is written in 'pdb' format to ``<csid>.pdb`` in the
    current directory. Returns the name of that file.
    """
    token = 'a03b1636-afc3-4204-9a2c-ede27680577c'  # XXX
    compound = ChemSpider(token).get_compound(csid)

    # mol -> pdb conversion
    converter = ob.OBConversion()
    converter.SetInAndOutFormats('mol', 'pdb')
    molecule = ob.OBMol()
    converter.ReadString(molecule, compound.mol_3d)
    molecule.AddHydrogens()

    pdbpath = '{}.pdb'.format(csid)
    with open(pdbpath, 'w') as pdb_file:
        pdb_file.write(converter.WriteString(molecule))
    return pdbpath
def getChemspiderCompounds(token, list, pref, delim="_", longNames=True, onlyLettersDigits=False):
    """
    Fetch ChemSpider compounds and turn them into named RDKit molecules.

    For each ID in ``list`` a name ``pref + delim + id`` is built; with
    ``longNames`` the ASCII-encoded common name is appended after another
    ``delim`` (optionally filtered by ``leftOnlyLettersDigits``). The
    molecule is created from the compound's SMILES and gets explicit
    hydrogens added. Every name is printed.

    Returns a tuple ``(molecules, names)`` of two parallel lists.
    """
    client = ChemSpider(token)
    molecules, names = [], []

    for cs_id in list:
        compound = client.get_compound(cs_id)

        label = pref + delim + str(cs_id)
        if longNames:
            common = compound.common_name.encode("ascii", "ignore")
            if onlyLettersDigits:
                common = leftOnlyLettersDigits(common)
            label = label + delim + common
        print(label)

        ascii_smiles = compound.smiles.encode("ascii", "ignore")
        with_hydrogens = Chem.AddHs(Chem.MolFromSmiles(ascii_smiles))

        molecules.append(with_hydrogens)
        names.append(label)

    return molecules, names
def get_image_url(self):
    """
    Return a ChemSpider image URL for this document, or None.

    The decoded metadata decides the path taken:
    - a 'csid' entry: the URL is built directly from ``self.csid``;
    - an 'InChIKey' entry: ChemSpider is searched with that key, the first
      hit's csid is written back into the metadata (and the object saved),
      and the hit's image URL is returned;
    - neither, or no search hits: None.
    """
    md = jsonpickle.decode(self.metadata)

    # Already annotated with a csid: just build the URL.
    if 'csid' in md:
        return 'http://www.chemspider.com/ImagesHandler.ashx?id=' + str(self.csid)

    # Neither a csid nor an InChIKey: no image.
    if 'InChIKey' not in md:
        return None

    # Look the csid up from the InChIKey.
    from chemspipy import ChemSpider
    cs = ChemSpider('b07b7eb2-0ba7-40db-abc3-2a77a7544a3d')
    results = cs.search(md['InChIKey'])
    if not results:
        return None

    # Remember the csid for next time, then hand back the image URL.
    first_hit = results[0]
    md['csid'] = first_hit.csid
    self.metadata = jsonpickle.encode(md)
    self.save()
    return first_hit.image_url
def get_image_url(self):
    """
    Return a ChemSpider image URL for this document, or None.

    The decoded metadata decides the path taken:
    - a 'csid' entry: the URL is built directly from ``self.csid``;
    - an 'InChIKey' / 'inchikey' entry: ChemSpider is searched with that key,
      the first hit's csid is written back into the metadata (and the object
      saved), and the hit's image URL is returned;
    - neither, or no search hits: None.
    """
    md = jsonpickle.decode(self.metadata)

    # Already annotated with a csid: just build the URL.
    if 'csid' in md:
        return 'http://www.chemspider.com/ImagesHandler.ashx?id=' + str(self.csid)

    # Neither a csid nor an InChIKey (in either spelling): no image.
    if 'InChIKey' not in md and 'inchikey' not in md:
        return None

    # Look the csid up from the InChIKey.
    from chemspipy import ChemSpider
    cs = ChemSpider('b07b7eb2-0ba7-40db-abc3-2a77a7544a3d')
    ikey = md.get('InChIKey',md.get('inchikey'))
    results = cs.search(ikey)
    if not results:
        return None

    # Remember the csid for next time, then hand back the image URL.
    first_hit = results[0]
    md['csid'] = first_hit.csid
    self.metadata = jsonpickle.encode(md)
    self.save()
    return first_hit.image_url
def smiles2cas(smiles_input):
    """
    Look up registry-number links on ChemSpider for a SMILES string.

    The SMILES is searched through the ChemSpider API; the CSID of the LAST
    search result is then used to load the ChemSpider search page, from
    which the ``href`` of every ``div.syn a[title="RN"]`` anchor is taken.

    :param smiles_input: query passed to the ChemSpider search
    :return: one entry per anchor, each being the list of double-quoted
        substrings found in that anchor's href; an empty list when the
        search has no results
    """
    myToken = 'a1d50aa3-6729-49df-a3e1-cd66240fab22'
    cs = ChemSpider(security_token=myToken)

    hits = list(cs.search(smiles_input))
    if not hits:
        # Previously this fell through to an unbound local variable.
        return []
    res = str(hits[-1].csid)

    # The session is closed again once both requests are done.
    with requests.session() as http:
        url = 'http://www.chemspider.com/MassSpecApi.asmx/GetExtendedCompoundInfoArray'
        http.post(url, data={'token': myToken})
        url_search = 'http://www.chemspider.com/Search.aspx?q=' + res
        r = http.get(url_search)

    soup = bs4.BeautifulSoup(r.text, "html.parser")
    hrefs = [a.attrs.get('href') for a in soup.select('div.syn a[title="RN"]')]
    # keep only what is written between double quotes in each href
    return [re.findall(r"\"(.+?)\"", href) for href in hrefs]
class SpiderRecovery:
    """Recovers SMILES strings from compound names by searching ChemSpider."""

    def __init__(self, chemspider_api_key: str):
        """Create the ChemSpider client and compile the stereo-descriptor pattern."""
        # The client must stay assigned: it used to be overwritten with None
        # right after construction, which made every search fail.
        self._cs = ChemSpider(chemspider_api_key)
        # R/S/E/Z (any case), '+' or '-' in parentheses, or R/S/E/Z followed
        # by a hyphen, space or opening parenthesis. Raw string so the
        # backslashes reach the regex engine unchanged.
        self._has_stero = re.compile(
            r'(?:\([RSrsEZez+\-]\))|(?:[RSrsEZez][- \(])')

    def recover_spider(self, name: str) -> Optional[str]:
        """Makes a best-effort attempt to recover SMILES strings from compound names unambiguously by searching ChemSpider.

        Errs slightly on the side of failure.
        If the compound name doesn't contain R, S, E, or Z (case-insensitive)
        in parentheses or followed by a hyphen or space, assumes the compound
        has no defined stereocenters. In other words, it assumes minimal
        stereochemistry.

        Returns the SMILES string if it was found unambiguously; otherwise
        returns None.
        """
        results = self._cs.search(name)
        if len(results) == 1:
            return results[0].smiles

        if len(results) > 0:
            # try to recover if they're just enantiomers: the first 14
            # characters of the InChIKey must agree for all hits
            connectivities = {result.inchikey[0:14] for result in results}
            # NOTE(review): ``match`` only tests the START of the name, while
            # the docstring says "contain" - confirm whether ``search`` was
            # intended before changing it.
            if len(connectivities) == 1 and self._has_stero.match(name) is None:
                no_sterocenters = {
                    result.smiles
                    for result in results
                    if '@' not in result.smiles
                    and '/' not in result.smiles
                    and '\\' not in result.smiles
                }
                if len(no_sterocenters) == 1:
                    return next(iter(no_sterocenters))
                if len(no_sterocenters) > 1:
                    warnings.warn(
                        "There are somehow {} compounds with the same connectivity and no defined sterocenters for {}"
                        .format(len(no_sterocenters), name))
        return None  # give up

    def recover_spiders(
            self,
            names: Iterable[str],
            sleep_seconds: float = 0.1) -> Iterator[Tuple[str, str]]:
        """Yields a (name, smiles) tuple each time a SMILES string is found.

        The yielded pairs can be made into a dict. ``sleep_seconds`` is
        waited after every lookup.
        """
        for name in names:
            smiles = self.recover_spider(name)
            time.sleep(sleep_seconds)  # don't annoy the admins!
            if smiles is not None:
                yield name, smiles
class ChemspiderSearcher:
    """Looks compound names up on ChemSpider, keeping only unambiguous hits."""

    def __init__(self, api_key: str):
        """Create the ChemSpider client from ``api_key``."""
        self.cs = ChemSpider(api_key)

    def chemspider_names(
            self,
            names: Iterable[str],
            partial_dict: Optional[Mapping[str, chemspipy.objects.Compound]] = None,
            sleep_secs_between: float = 0.1
    ) -> Mapping[str, chemspipy.objects.Compound]:
        r"""Build a dictionary mapping compound names to unique ChemSpider hits.

        Values are chemspipy.objects.Compound objects. ``partial_dict`` is
        used as a starting point and is not modified; names it already
        contains are not searched again. Warns for each compound that has
        multiple or no hits; such compounds are left out of the result.
        ``sleep_secs_between`` is waited after every search.
        Immediately pickling the fetched results may be a good idea.

        Example usage:
            searcher = ChemspiderSearcher(api_key)
            found = searcher.chemspider_names(['Trichostatin A', 'Oxamflatin', 'Vinblastine'])
            for name, compound in found.items():
                print("{} → {}".format(compound.csid, compound.smiles))
        Result:
            UserWarning: Multiple (2) hits found for Oxamflatin
            392575 → C[C@H](/C=C(\C)/C=C/C(=O)NO)C(=O)c1ccc(cc1)N(C)C
            12773 → CC[C@@]1(C[C@H]2C[C@@](c3c(c4ccccc4[nH]3)CCN(C2)C1)(c5cc6c(cc5OC)N([C@@H]7[C@]68CCN9[C@H]8[C@@](C=CC9)([C@H]([C@@]7(C(=O)OC)O)OC(=O)C)CC)C)C(=O)OC)O
        """
        def fetch(name: str) -> Optional[chemspipy.objects.Compound]:
            """Return the single hit for ``name``, or None (with a warning)."""
            results = list(self.cs.search(name))  # blocks
            if len(results) == 0:
                warnings.warn("No results found for {}".format(name))
                return None
            if len(results) > 1:
                warnings.warn('Multiple ({}) hits found for {}'.format(
                    len(results), name))
                return None
            return results[0]

        # Copy so the caller's mapping is never touched; dict() works for any
        # Mapping, and None stands in for "start from scratch" (no shared
        # mutable default).
        new_dict = dict(partial_dict) if partial_dict is not None else {}

        # Each distinct, not-yet-known name is searched once, in input order.
        for name in dict.fromkeys(names):
            if name in new_dict:
                continue
            got = fetch(name)
            time.sleep(sleep_secs_between)
            if got is not None:
                new_dict[name] = got
        return new_dict
from chemspipy import ChemSpider

# Module-wide ChemSpider client shared by the helpers below.
cs = ChemSpider('c48d4595-ead2-40e7-85c9-1e5d2a77754c')


def get_chem(query):
    """
    Return ``{'name': ..., 'smiles': ...}`` for the first ChemSpider hit.

    'name' is the hit's common name. Returns None when the search has no
    results.
    """
    results = cs.search(query)
    if not results:
        return None
    top_hit = results[0]
    return {'name': top_hit.common_name, 'smiles': top_hit.smiles}


def get_smiles(query):
    """Return the SMILES of the first ChemSpider hit for ``query``, or None."""
    results = cs.search(query)
    return results[0].smiles if results else None
import CoolProp
from chemspipy import ChemSpider
from chemspipy_key import key  # private file with the key (DO NOT COMMIT!!)
import glob, json

# ChemSpider client authenticated with the private key imported above
cs = ChemSpider(key)

# Map from name to Chemspider ID
# NOTE(review): presumably a fallback for fluids that cannot be looked up
# by name - its use is not visible in this part of the script.
backup_map = {
    'Propyne': 6095,
    'R236EA': 71342,
    'R245ca': 62827,
    'trans-2-Butene': 56442,
    'Oxygen': 952,
    'Fluorine': 22932,
    'Hydrogen': 762,
    'Deuterium': 22931,
    'HFE143m': 66577,
    'SulfurHexafluoride': 16425,
    'R114': 13853215
}

# Make sure the key works: fetch a known compound and compare its InChIKey
c = cs.get_compound(2157)
assert (c.inchikey == 'BSYNRYMUTXBXSQ-UHFFFAOYAW')

# Walk over every fluid JSON file and read the fluid's name from INFO/NAME
for fname in glob.glob('../fluids/*.json'):
    with open(fname, 'r') as fp:
        jj = json.load(fp)
    fluid = jj['INFO']['NAME']
def process(self, input_text: str = "", input_file: str = "", output_file: str = "", output_file_sdf: str = "", sdf_append: bool = False, input_type: str = "", lang: str = "eng", paged_text: bool = False, format_output: bool = True, opsin_types: list = None, standardize_mols: bool = True, convert_ions: bool = True, write_header: bool = True, iob_format: bool = False, dry_run: bool = False, csv_delimiter: str = ";", normalize_text: bool = True, remove_duplicates: bool = False, annotate: bool = True, annotation_sleep: int = 2, chemspider_token: str = "", continue_on_failure: bool = False) -> OrderedDict: r""" Process the input file with ChemSpot. Parameters ---------- input_text : str String to be processed by ChemSpot. input_file : str Path to file to be processed by ChemSpot. output_file : str File to write output in. output_file_sdf : str File to write SDF output in. SDF is from OPSIN converted entities. sdf_append : bool If True, append new molecules to existing SDF file or create new one if doesn't exist. SDF is from OPSIN converted entities. input_type : str | When empty, input (MIME) type will be determined from magic bytes. | Or you can specify "pdf", "pdf_scan", "image" or "text" and magic bytes check will be skipped. lang : str | Language which will Tesseract use for OCR. Available languages: https://github.com/tesseract-ocr/tessdata | Multiple languages can be specified with "+" character, i.e. "eng+bul+fra". paged_text : bool If True and `input_type` is "text" or `input_text` is provided, try to assign pages to chemical entities. ASCII control character 12 (Form Feed, '\f') is expected between pages. format_output : bool | If True, the value of "content" key of returned dict will be list of OrderedDicts. | If True and `output_file` is set, the CSV file will be written. | If False, the value of "content" key of returned dict will be None. opsin_types : list | List of ChemSpot entity types. Entities of types in this list will be converted with OPSIN. 
If you don't want to convert entities, pass empty list. | OPSIN is designed to convert IUPAC names to linear notation (SMILES etc.) so default value of `opsin_types` is ["SYSTEMATIC"] (these should be only IUPAC names). | ChemSpot entity types: "SYSTEMATIC", "IDENTIFIER", "FORMULA", "TRIVIAL", "ABBREVIATION", "FAMILY", "MULTIPLE" standardize_mols : bool If True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules converted by OPSIN. convert_ions : bool If True, try to convert ion entities (e.g. "Ni(II)") to SMILES. Entities matching ion regex won't be converted with OPSIN. write_header : bool If True and if `output_file` is set and `output_format` is True, write a CSV write_header: "smiles", "bond_length", "resolution", "confidence", "learn", "page", "coordinates" iob_format : bool If True, output will be in IOB format. dry_run : bool If True, only return list of commands to be called by subprocess. csv_delimiter : str Delimiter for output CSV file. normalize_text : bool If True, normalize text before performing NER. It is strongly recommended to do so, because without normalization can ChemSpot produce unpredictable results which cannot be parsed. remove_duplicates : bool If True, remove duplicated chemical entities. Note that some entities-compounds can have different names, but same notation (SMILES, InChI etc.). This will only remove entities with same names. Not applicable for IOB format. annotate : bool | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with each identifier, separately for entity name, SMILES etc. | If entity has InChI key yet, prefer it in searching. | If "*" is present in SMILES, skip annotation. | If textual entity has single result in DB when searched by name, fill in missing identifiers (SMILES etc.). annotation_sleep: int How many seconds to sleep between annotation of each entity. It's for preventing overloading of databases. 
chemspider_token : str Your personal token for accessing the ChemSpider API (needed for annotation). Make account there to obtain it. continue_on_failure : bool | If True, continue running even if ChemSpot returns non-zero exit code. | If False and error occurs, print it and return. Returns ------- dict Keys: - stdout: str ... standard output from ChemSpot - stderr: str ... standard error output from ChemSpot - exit_code: int ... exit code from ChemSpot - content - list of OrderedDicts ... when `format_output` is True - None ... when `format_output` is False - normalized_text : str """ if opsin_types is None: opsin_types = ["SYSTEMATIC"] if input_text and input_file: input_file = "" self.logger.warning("Both 'input_text' and 'input_file' are set, but 'input_text' will be prefered.") elif not input_text and not input_file: raise ValueError("One of 'input_text' or 'input_file' must be set.") if not input_type and not input_text: possible_input_types = ["pdf", "image", "text"] input_type = get_input_file_type(input_file) if input_type not in possible_input_types: raise ValueError("Input file type ({}) is not one of {}".format(input_type, possible_input_types)) elif input_type and not input_text: possible_input_types = ["pdf", "pdf_scan", "image", "text"] if input_type not in possible_input_types: raise ValueError("Unknown 'input_type'. 
Possible 'input_type' values are {}".format(possible_input_types)) if input_type in ["pdf", "pdf_scan", "image"]: input_text, _ = get_text(input_file, input_type, lang=lang, tessdata_prefix=os.environ["TESSDATA_PREFIX"]) input_file = "" if annotate and not chemspider_token: self.logger.warning("Cannot perform annotation in ChemSpider: 'chemspider_token' is empty.") options = ChainMap({k: v for k, v in {"iob_format": iob_format}.items() if v}, self.options_internal) output_file_temp = None commands, _, _ = self.build_commands(options, self._OPTIONS_REAL, self.path_to_binary) commands.insert(1, str(self.options_internal["max_memory"])) commands.append("-t") if normalize_text: normalizer = Normalizer(strip=True, collapse=True, hyphens=True, quotes=True, slashes=True, tildes=True, ellipsis=True) if input_file: with open(input_file, mode="r") as f: input_text = f.read() input_text = normalizer(input_text) if not input_text: raise UserWarning("'input_text' is empty after normalization.") input_text = self.normalize_text(text=input_text) input_file_normalized = NamedTemporaryFile(mode="w", encoding="utf-8") input_file_normalized.write(input_text) input_file_normalized.flush() input_file = input_file_normalized.name else: if input_text: input_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8") input_file_temp.write(input_text) input_file_temp.flush() input_file = input_file_temp.name commands.append(os.path.abspath(input_file)) commands.append("-o") if format_output: output_file_temp = NamedTemporaryFile(mode="w", encoding="utf-8") commands.append(os.path.abspath(output_file_temp.name)) else: commands.append(os.path.abspath(output_file)) if dry_run: return " ".join(commands) stdout, stderr, exit_code = common_subprocess(commands) if "OutOfMemoryError" in stderr: raise RuntimeError("ChemSpot memory error: {}".format(stderr)) to_return = {"stdout": stdout, "stderr": stderr, "exit_code": exit_code, "content": None, "normalized_text": input_text if normalize_text else 
None} if not continue_on_failure and exit_code > 0: self.logger.warning("ChemSpot error:") eprint("\n\t".join("\n{}".format(stderr).splitlines())) return to_return if normalize_text: to_return["normalized_text"] = input_text if not format_output: return to_return elif format_output: with open(output_file_temp.name, mode="r", encoding="utf-8") as f: output_chs = f.read() entities = self.parse_chemspot_iob(text=output_chs) if iob_format else self.parse_chemspot(text=output_chs) to_return["content"] = entities if remove_duplicates and not iob_format: seen = set() seen_add = seen.add to_return["content"] = [x for x in to_return["content"] if not (x["entity"] in seen or seen_add(x["entity"]))] if input_type in ["pdf", "pdf_scan"] or paged_text: page_ends = [] for i, page in enumerate(input_text.split("\f")): if page.strip(): try: page_ends.append(page_ends[-1] + len(page) - 1) except IndexError: page_ends.append(len(page) - 1) if opsin_types: if convert_ions: to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types and not self.re_ion.match(x["entity"])] else: to_convert = [x["entity"] for x in to_return["content"] if x["type"] in opsin_types] if to_convert: opsin = OPSIN(verbosity=self.verbosity) opsin_converted = opsin.process(input=to_convert, output_formats=["smiles", "inchi", "inchikey"], standardize_mols=standardize_mols, output_file_sdf=output_file_sdf, sdf_append=sdf_append) opsin_converted = iter(opsin_converted["content"]) else: self.logger.info("Nothing to convert with OPSIN.") if annotate: chemspider = ChemSpider(chemspider_token) if chemspider_token else None for i, ent in enumerate(to_return["content"]): if input_type in ["pdf", "pdf_scan"] or paged_text: ent["page"] = str(bisect.bisect_left(page_ends, int(ent["start"])) + 1) if convert_ions: match_ion = self.re_ion.match(ent["entity"]) if match_ion: match_ion = match_ion.groupdict() match_charge = self.re_charge.search(match_ion["charge"]) if match_charge: match_charge = 
match_charge.groupdict() if match_charge["roman"]: smiles = "[{}+{}]".format(match_ion["ion"], len(match_charge["roman"])) elif match_charge["digit"]: if "+" in match_ion["charge"]: smiles = "[{}+{}]".format(match_ion["ion"], match_charge["digit"]) elif "-" in match_ion["charge"]: smiles = "[{}-{}]".format(match_ion["ion"], match_charge["digit"]) elif match_charge["signs"]: smiles = "[{}{}{}]".format(match_ion["ion"], match_charge["signs"][0], len(match_charge["signs"])) mol = MolFromSmiles(smiles) if mol: inchi = MolToInchi(mol) if inchi: ent.update(OrderedDict( [("smiles", smiles), ("inchi", inchi), ("inchikey", InchiToInchiKey(inchi))])) else: ent.update(OrderedDict([("smiles", smiles), ("inchi", ""), ("inchikey", "")])) else: ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")])) else: ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", "")])) if opsin_types and to_convert: if ent["entity"] in to_convert: ent_opsin = next(opsin_converted) ent.update(OrderedDict([("smiles", ent_opsin["smiles"]), ("inchi", ent_opsin["inchi"]), ("inchikey", ent_opsin["inchikey"]), ("opsin_error", ent_opsin["error"])])) elif convert_ions and self.re_ion.match(ent["entity"]): ent.update(OrderedDict([("opsin_error", "")])) elif (convert_ions and not self.re_ion.match(ent["entity"])) or (not convert_ions and ent["entity"] not in to_convert): ent.update(OrderedDict([("smiles", ""), ("inchi", ""), ("inchikey", ""), ("opsin_error", "")])) # TODO: this should be simplified...looks like garbage code if annotate: self.logger.info("Annotating entity {}/{}...".format(i + 1, len(to_return["content"]))) ent.update(OrderedDict([("pch_cids_by_inchikey", ""), ("chs_cids_by_inchikey", ""), ("pch_cids_by_name", ""), ("chs_cids_by_name", ""), ("pch_cids_by_smiles", ""), ("chs_cids_by_smiles", ""), ("pch_cids_by_inchi", ""), ("chs_cids_by_inchi", ""), ("pch_cids_by_formula", ""), ("pch_iupac_name", ""), ("chs_common_name", ""), ("pch_synonyms", "")])) # do 
"double-annotation": some entities can be found in only one DB, updated and then searched in second DB found_in_pch = False found_in_chs = False for _ in range(2): results = [] # prefer InChI key if "inchikey" in ent and ent["inchikey"]: try: results = get_compounds(ent["inchikey"], "inchikey") if results: if len(results) == 1: result = results[0] synonyms = result.synonyms if synonyms: ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms)) ent["pch_iupac_name"] = result.iupac_name if not found_in_chs: ent["smiles"] = result.canonical_smiles or ent["smiles"] ent["inchi"] = result.inchi or ent["inchi"] ent["inchikey"] = result.inchikey or ent["inchikey"] ent["pch_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.cid) for c in results])) except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass results = chemspider.search(ent["inchikey"]) if chemspider_token else [] if results: if len(results) == 1: result = results[0] ent["chs_common_name"] = result.common_name if not found_in_pch: ent["smiles"] = result.smiles or ent["smiles"] ent["inchi"] = result.stdinchi or ent["inchi"] ent["inchikey"] = result.stdinchikey or ent["inchikey"] ent["chs_cids_by_inchikey"] = "\"{}\"".format(",".join([str(c.csid) for c in results])) else: if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs): try: results = get_compounds(ent["entity"] or ent["abbreviation"], "name") if results: if len(results) == 1: found_in_pch = True result = results[0] synonyms = result.synonyms if synonyms: ent["pch_synonyms"] = "\"{}\"".format("\",\"".join(synonyms)) # only update identifiers if they weren't found in second DB if not found_in_chs: ent["smiles"] = result.canonical_smiles or ent["smiles"] ent["inchi"] = result.inchi or ent["inchi"] ent["inchikey"] = result.inchikey or ent["inchikey"] ent["pch_iupac_name"] = result.iupac_name ent["pch_cids_by_name"] = "\"{}\"".format(",".join([str(c.cid) for c 
in results])) except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs): results = chemspider.search(ent["entity"] or ent["abbreviation"]) if chemspider_token else [] if results: if len(results) == 1: found_in_chs = True result = results[0] if not found_in_pch: ent["smiles"] = result.smiles or ent["smiles"] ent["inchi"] = result.stdinchi or ent["inchi"] ent["inchikey"] = result.stdinchikey or ent["inchikey"] ent["chs_common_name"] = result.common_name ent["chs_cids_by_name"] = "\"{}\"".format(",".join([str(c.csid) for c in results])) for search_field, col_pch, col_chs in [("smiles", "pch_cids_by_smiles", "chs_cids_by_smiles"), ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi"), ("formula", "pch_cids_by_formula", "")]: results_pch = [] results_chs = [] if search_field == "smiles" and "smiles" in ent and ent["smiles"] and "*" not in ent["smiles"]: if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs): try: results_pch = get_compounds(ent["smiles"], "smiles") except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs): results_chs = chemspider.search(ent["smiles"]) if chemspider_token else [] elif search_field == "inchi" and "inchi" in ent and ent["inchi"]: if (not found_in_pch and not found_in_chs) or (not found_in_pch and found_in_chs): try: results_pch = get_compounds(ent["inchi"], "inchi") except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass if (not found_in_pch and not found_in_chs) or (found_in_pch and not found_in_chs): results_chs = chemspider.search(ent["inchi"]) if chemspider_token else [] elif search_field == "formula": if (not found_in_pch and not found_in_chs) 
or (not found_in_pch and found_in_chs): try: results_pch = get_compounds(ent["entity"], "formula") except (BadRequestError, NotFoundError, PubChemHTTPError, ResponseParseError, ServerError, TimeoutError, PubChemPyError): pass # ChemSpider doesn't have search field for 'formula' if results_pch: ent[col_pch] = "\"{}\"".format(",".join([str(c.cid) for c in results_pch])) if results_chs: ent[col_chs] = "\"{}\"".format(",".join([str(c.csid) for c in results_chs])) sleep(0.5) sleep(annotation_sleep) if not found_in_pch and not found_in_chs: break if output_file: dict_to_csv(to_return["content"], output_file=output_file, csv_delimiter=csv_delimiter, write_header=write_header) return to_return
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment before the tokens and binary paths are read below.
load_dotenv()
DISCORD_TOKEN = os.getenv('DISCORD_TOKEN')
CHEMSPIDER_TOKEN = os.getenv('CHEMSPIDER_TOKEN')
WOLFRAM_TOKEN = os.getenv('WOLFRAM_TOKEN')

# Module-level API clients, all configured from the environment.
cs = ChemSpider(CHEMSPIDER_TOKEN)
wolfram = wolframalpha.Client(WOLFRAM_TOKEN)
client = commands.Bot(command_prefix='!')

# Headless Chrome whose browser binary and driver locations both come from
# environment variables.
op = webdriver.ChromeOptions()
op.binary_location = os.getenv('GOOGLE_CHROME_BIN')
op.add_argument('--headless')
op.add_argument('--no-sandbox')
# The flag was previously misspelled '--disable-dev-sh-usage', which Chrome
# does not recognise and silently ignores. The real switch stops Chrome from
# using /dev/shm for shared memory.
op.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(
    executable_path=os.getenv('CHROMEDRIVER_PATH'), chrome_options=op)

# for local testing purposes only; comment out when deployed to Heroku
#driver = webdriver.Firefox()
class CompoundForm(forms.ModelForm):
    """
    A form for users to add compounds to the compound guide.

    Forces a check against the chemspider database to ensure no spurious
    compounds make their way into the compound guide.
    """

    # Adding this field, not in the database, allows users to match compounds
    # to a CAS_ID without us incurring issues for storing them.
    CAS_ID = forms.CharField(label='CAS ID', required=False)

    # If the user already knows the right value for this it allows them to
    # skip a step.
    CSID = forms.IntegerField(
        label='Chemspider ID', min_value=1,
        error_messages={'required': 'This value must be set or selected'})

    class Meta:
        fields = ('labGroup', 'abbrev', 'CSID',
                  'name', 'CAS_ID', 'chemicalClasses')
        model = Compound
        help_texts = {
            'abbrev': 'A local abbreviation by which the compound is known.',
            'name': 'A common or IUPAC name for the compound.',
            'CAS_ID': 'The CAS number for the compound. Optional.',
            'CSID': 'The Chemspider ID for the compound. If this is not included, a list will be provided for you to choose from.'
        }

    def __init__(self, user, *args, **kwargs):
        """Restrict the lab group choices to the lab groups of ``user``.

        ``user`` is consumed here; the remaining arguments are passed to the
        parent form unchanged.
        """
        super(CompoundForm, self).__init__(*args, **kwargs)
        # Chemspider record matching the submitted CSID; set by clean_CSID.
        self.compound = None
        self.chemSpider = ChemSpider(settings.CHEMSPIDER_TOKEN)
        self.fields['labGroup'].queryset = user.labgroup_set.all()
        if user.labgroup_set.all().exists():
            # Drop the blank option when the user has a lab group to pick.
            self.fields['labGroup'].empty_label = None

    def clean_CSID(self):
        """Check that the CSID is actually a valid id from chemspider.

        Stores the first search hit on ``self.compound`` and returns the
        cleaned CSID. Raises ValidationError (code ``invalid_csid``) when the
        search returns nothing.
        """
        searchResults = self.chemSpider.simple_search(
            self.cleaned_data['CSID'])
        if len(searchResults) < 1:
            raise ValidationError(
                'The CSID you have provided is invalid', code='invalid_csid')
        self.compound = searchResults[0]
        return self.cleaned_data['CSID']

    def clean(self):
        """Verify that the CSID, CAS_ID (where supplied) and name are consistent.

        Returns the cleaned data. Raises ValidationError when no chemspider
        record can be found for the search terms (``no_compounds``), or when
        the name (``name_csid_conflict``) or CAS ID
        (``name_cas_id_conflict``) does not match the CSID that was supplied.
        """
        self.cleaned_data = super(CompoundForm, self).clean()
        # Treat a missing, None or empty CAS_ID identically as "not
        # supplied". The former `!= ''` test let a missing key through and
        # then indexed cleaned_data['CAS_ID'], raising KeyError.
        cas_id = self.cleaned_data.get('CAS_ID')

        if not self.cleaned_data.get('name'):
            if self.compound is not None:
                # this is probably some of the most horrible code I have
                # written, but it is the only way to get this to work - Phil.
                # because otherwise the query dict is immutable
                data = self.data.copy()
                # replace the data directly, as bad as that is...
                data['name'] = self.compound.common_name
                # manually input an error message which is less demanding
                # (this is actually canonical method)
                self._errors['name'] = self.error_class(
                    ['Please review this suggestion'])
                self.data = data  # override the old data
            return self.cleaned_data

        nameResults = self.chemSpider.simple_search(
            self.cleaned_data['name'])
        if cas_id:
            CAS_IDResults = self.chemSpider.simple_search(cas_id)
            # the CAS_ID always generates a more restrictive set
            compoundChoices = [
                compound for compound in nameResults
                if compound in CAS_IDResults][0:10]
        else:
            CAS_IDResults = None
            # if the CAS_ID is not supplied, then we just create a subset
            # based on the name search alone
            compoundChoices = nameResults[0:10]

        if self.compound is None:
            if len(compoundChoices) > 0:
                # in essence, if a CSID was not supplied, but the chemspider
                # search returned chemspider results, then we offer those
                # results to the user to make a selection.
                self.fields['CSID'] = forms.ChoiceField(
                    choices=((choice.csid, choice.common_name)
                             for choice in compoundChoices),
                    widget=forms.widgets.RadioSelect)
                return self.cleaned_data
            raise ValidationError(
                'Your search terms failed to validate against the Chemspider database. Please contact a local administrator.',
                code='no_compounds')

        if self.compound not in nameResults:
            raise ValidationError(
                'The name provided was not valid for the CSID provided. Please change the entry, or contact your local administrator.',
                code='name_csid_conflict')
        if cas_id and self.compound not in CAS_IDResults:
            raise ValidationError(
                'The CAS ID provided is not valid for the CSID provided. Remove, replace, or contact your local administrator.',
                code='name_cas_id_conflict')
        return self.cleaned_data

    def save(self, commit=True):
        """Create (and if appropriate, saves) the compound instance, and adds Inchi and smiles from chemspider.

        With ``commit=False`` the instance is populated and returned without
        being written to the database.
        """
        compound = super(CompoundForm, self).save(commit=False)
        csCompound = self.chemSpider.get_compound(compound.CSID)
        compound.INCHI = csCompound.inchi
        compound.smiles = csCompound.smiles
        compound.formula = csCompound.molecular_formula
        if commit:
            compound.save()
            self.save_m2m()
        return compound
def __init__(self, argv):
    """Load the input arguments and create the ChemSpider client.

    ``argv[1]`` is stored as the CAS file.
    """
    cas_file = argv[1]
    self.CAS_file = cas_file

    # my chemspider token
    # NOTE(review): the API token is hard-coded in source; consider loading
    # it from configuration instead.
    token = 'd1778a9f-c41f-41f6-920e-fc6d9ff739ca'
    self.cs = ChemSpider(token)
def handle(self, *args, **kwargs):
    """Handle the command call.

    Imports legacy data from tab-separated files found in
    ``kwargs['directory']``, in four stages:

    1. delete previously imported reactions,
    2. create reactions from ``performedReactions.tsv``,
    3. create or update the manually entered descriptor values,
    4. create compound quantities from ``compoundquantities.tsv`` (each
       processed row is also appended to ``compoundquantities_fixed.tsv``).

    Options read from ``kwargs``:

    * ``reactions`` / ``descriptors`` / ``quantities`` -- start at that
      stage instead of at the deletion stage. Stage 4 always runs.
    * ``start_number`` -- rows with a lower index are skipped in the stage
      the run starts at.
    * ``delete_all`` -- in stage 1, delete every reaction instead of only
      those that match a row of the input file.
    * ``no_compound_prompts`` -- never prompt for unknown compounds; such
      rows are recorded as unknown reactants instead.

    Several code paths prompt interactively with ``raw_input``.

    Raises RuntimeError for an unrecognised ``leak``/``slow_cool`` value or
    an invalid unit.
    """
    folder = kwargs['directory']
    start_at_reactions = kwargs['reactions']
    start_at_descriptors = kwargs['descriptors']
    start_at_quantities = kwargs['quantities']
    start_number = kwargs['start_number']
    delete_all = kwargs['delete_all']
    no_compound_prompts = kwargs['no_compound_prompts']
    # With no explicit starting stage the run begins at the deletion stage.
    start_at_delete = not (
        start_at_reactions or start_at_descriptors or start_at_quantities)
    reactions_path = path.join(folder, 'performedReactions.tsv')

    # ---- Stage 1: delete previously imported reactions -------------------
    if start_at_delete:
        self.stdout.write('Deleting reactions')
        if delete_all:
            PerformedReaction.objects.all().delete()
        else:
            with transaction.atomic():
                with open(reactions_path) as reactions:
                    reader = csv.DictReader(reactions, delimiter='\t')
                    for i, r in enumerate(reader):
                        if start_at_delete and i < start_number:
                            continue
                        ref = convert_legacy_reference(r['reference'])
                        legacyID = r['id']
                        # Each lookup is evaluated only after the previous
                        # match has been deleted, as before.
                        stale_lookups = (
                            ({'reference': ref},
                             '{}: Deleting reaction with reference {}',
                             ref),
                            ({'reference': ref.lower()},
                             '{}: Deleting reaction with converted legacy reference {}',
                             ref),
                            ({'convertedLegacyRef': ref},
                             '{}: Deleting reaction with converted legacy reference {}',
                             ref),
                            ({'legacyID': legacyID},
                             '{}: Deleting reaction with legacy id {}',
                             legacyID),
                        )
                        for lookup, message, shown in stale_lookups:
                            ps = PerformedReaction.objects.filter(**lookup)
                            if ps:
                                self.stdout.write(message.format(i, shown))
                                ps.delete()

    # ---- Stage 2: create reactions ---------------------------------------
    if start_at_reactions or start_at_delete:
        # From here on warnings are raised as exceptions.
        warnings.simplefilter('error')
        with open(reactions_path) as reactions:
            self.stdout.write('Creating reactions')
            reader = csv.DictReader(reactions, delimiter='\t')
            for i, r in enumerate(reader):
                if start_at_reactions and i < start_number:
                    continue
                ref = convert_legacy_reference(r['reference'])
                convertedLegacyRef = ref
                ps = PerformedReaction.objects.filter(
                    convertedLegacyRef=convertedLegacyRef)
                if ps.exists():
                    # The converted reference is already taken: append the
                    # legacy id to both the new and the existing reactions
                    # and mark them invalid.
                    ref = '{}_{}'.format(ref, r['id'])
                    valid = False
                    notes = r['notes'] + \
                        ' Duplicate reference disambiguated with legacy id.'
                    for p in ps:
                        if p.convertedLegacyRef == p.reference:
                            p.valid = False
                            p.notes += u' Duplicate reference disambiguated with legacy id.'
                            p.reference = '{}_{}'.format(
                                p.convertedLegacyRef, p.legacyID)
                            p.save(calcDescriptors=False)
                else:
                    valid = bool(int(r['valid']))
                    notes = r['notes']
                p = PerformedReaction(
                    reference=ref,
                    legacyRef=r['reference'],
                    convertedLegacyRef=convertedLegacyRef,
                    labGroup=LabGroup.objects.get(title=r['labGroup.title']),
                    legacyID=r['id'],
                    notes=notes,
                    user=User.objects.get(username=r['user.username']),
                    valid=valid,
                    legacyRecommendedFlag=(
                        r['legacyRecommendedFlag'] == 'Yes'),
                    insertedDateTime=r['insertedDateTime'],
                    public=int(r['public'])
                )
                self.stdout.write(
                    '{}: Creating reaction with reference {}'.format(i, ref))
                p.full_clean()
                p.save(calcDescriptors=False)

    # ---- Stage 3: manual descriptor values -------------------------------
    if start_at_delete or start_at_reactions or start_at_descriptors:
        with open(reactions_path) as reactions:
            self.stdout.write('Creating manual descriptors')
            reader = csv.DictReader(reactions, delimiter='\t')
            outValues = []
            outBoolValues = []
            purityValues = []
            temperatureValues = []
            timeValues = []
            pHValues = []
            preHeatStandingValues = []
            teflonValues = []
            leakValues = []
            slowCoolValues = []
            # (label used in progress output, model, values awaiting
            # bulk_create), in the order the batches are written.
            batches = (
                ('outValues', OrdRxnDescriptorValue, outValues),
                ('outBoolValues', BoolRxnDescriptorValue, outBoolValues),
                ('purityValues', OrdRxnDescriptorValue, purityValues),
                ('temperatureValues', NumRxnDescriptorValue,
                 temperatureValues),
                ('timeValues', NumRxnDescriptorValue, timeValues),
                ('pHValues', NumRxnDescriptorValue, pHValues),
                ('preHeatStandingValues', NumRxnDescriptorValue,
                 preHeatStandingValues),
                ('teflonValues', BoolRxnDescriptorValue, teflonValues),
                ('leakValues', BoolRxnDescriptorValue, leakValues),
                ('slowCoolValues', BoolRxnDescriptorValue, slowCoolValues),
            )
            for i, r in enumerate(reader):
                if start_at_descriptors and i < start_number:
                    continue
                ref = convert_legacy_reference(r['reference'])
                legacy_id = r['id']
                self.stdout.write(
                    '{}: Reiterating for reaction with reference {}, legacyID {}'.format(i, ref, legacy_id))
                p = PerformedReaction.objects.get(legacyID=legacy_id)

                if r['duplicateOf.reference']:
                    convertedDupRef = convert_legacy_reference(
                        r['duplicateOf.reference'])
                    try:
                        p.duplicateOf = PerformedReaction.objects.get(
                            convertedLegacyRef=convertedDupRef)
                        p.save(calcDescriptors=False)
                    except PerformedReaction.DoesNotExist:
                        self.stderr.write('Reaction {} marked as duplicate of reaction {}, but the latter does not exist'.format(
                            ref, r['duplicateOf.reference']))
                        p.notes += 'Marked as duplicate of reaction with legacy reference {}, but it does not exist'.format(
                            r['duplicateOf.reference'])
                        p.valid = False
                        p.save(calcDescriptors=False)
                    except PerformedReaction.MultipleObjectsReturned:
                        self.stderr.write('Reaction {} marked as duplicate of reaction {}, but more than one of the latter exists'.format(
                            ref, r['duplicateOf.reference']))
                        p.notes += 'Marked as duplicate of reaction with legacy reference {}, but more than one reaction with that reference exists'.format(
                            r['duplicateOf.reference'])
                        p.valid = False
                        p.save(calcDescriptors=False)

                # Each value is computed and synchronised in turn, so a bad
                # cell stops the run only after the earlier descriptors of
                # the row have been handled (same order as before).
                outcomeValue = int(r['outcome']) if (
                    r['outcome'] in (str(x) for x in range(1, 5))) else None
                self._sync_descriptor_value(
                    OrdRxnDescriptorValue, outcomeDescriptor, p,
                    outcomeValue, outValues)

                value = True if (outcomeValue > 2) else False
                self._sync_descriptor_value(
                    BoolRxnDescriptorValue, outcomeBooleanDescriptor, p,
                    value, outBoolValues)

                value = int(r['purity']) if (
                    r['purity'] in ('1', '2')) else None
                self._sync_descriptor_value(
                    OrdRxnDescriptorValue, purityDescriptor, p,
                    value, purityValues)

                # presumably Celsius -> Kelvin; confirm against the data
                value = (float(r['temp']) + 273.15) if (
                    r['temp'] not in ('', '?')) else None
                self._sync_descriptor_value(
                    NumRxnDescriptorValue, temperatureDescriptor, p,
                    value, temperatureValues)

                value = float(r['time']) * 60 if (
                    r['time'] not in ['', '?']) else None
                self._sync_descriptor_value(
                    NumRxnDescriptorValue, timeDescriptor, p,
                    value, timeValues)

                value = float(r['pH']) if (
                    r['pH'] not in ('', '?')) else None
                self._sync_descriptor_value(
                    NumRxnDescriptorValue, pHDescriptor, p,
                    value, pHValues)

                # NOTE(review): bool() of any non-empty string is True, yet
                # the value is stored in a numeric descriptor -- confirm
                # this is intended.
                value = bool(r['pre_heat standing']) if (
                    r.get('pre_heat standing') not in ('', None)) else None
                self._sync_descriptor_value(
                    NumRxnDescriptorValue, preHeatStandingDescriptor, p,
                    value, preHeatStandingValues)

                value = bool(int(r['teflon_pouch'])) if (
                    r.get('teflon_pouch') not in (None, '')) else None
                self._sync_descriptor_value(
                    BoolRxnDescriptorValue, teflonDescriptor, p,
                    value, teflonValues)

                value = self._parse_yes_no(r['leak'], 'leak')
                self._sync_descriptor_value(
                    BoolRxnDescriptorValue, leakDescriptor, p,
                    value, leakValues)

                value = self._parse_yes_no(r['slow_cool'], 'slow_cool')
                self._sync_descriptor_value(
                    BoolRxnDescriptorValue, slowCoolDescriptor, p,
                    value, slowCoolValues)

                # Write out any batch that has grown past the threshold.
                for label, model, values in batches:
                    if len(values) > save_at_once:
                        self.stdout.write("Saving {}...".format(label))
                        model.objects.bulk_create(values)
                        del values[:]
                        self.stdout.write("...saved")

            self.stdout.write("Saving all remaining values...")
            for label, model, values in batches:
                model.objects.bulk_create(values)
            for label, model, values in batches:
                del values[:]
            self.stdout.write("...saved")

    # ---- Stage 4: compound quantities (always runs) ----------------------
    with open(path.join(folder, 'compoundquantities.tsv')) as cqs, open(path.join(folder, 'compoundquantities_fixed.tsv'), 'a') as fixed_cqs:
        self.stdout.write('Creating or updating compound quantities')
        reader = csv.DictReader(cqs, delimiter='\t')
        writer = csv.DictWriter(
            fixed_cqs, reader.fieldnames + ['compound.old_abbrev'], delimiter='\t')
        # The output file is opened in append mode, so the header is only
        # written when the run is not resuming part-way through.
        if not (start_at_quantities and start_number > 0):
            writer.writeheader()
        quantities = []
        cs = ChemSpider(settings.CHEMSPIDER_TOKEN)
        for i, r in enumerate(reader):
            if start_at_quantities and (i < start_number):
                continue
            if not (r['compound.abbrev'] or r['compoundrole.name'] or r['amount']):
                # this is just a blank entry
                continue
            legacyID = r['reaction.id']
            reaction = PerformedReaction.objects.get(legacyID=legacyID)
            compound_abbrev = r['compound.abbrev']
            # Reuse a correction recorded earlier for this abbreviation.
            correct_abbrev = reagent_dict[
                compound_abbrev] if compound_abbrev in reagent_dict else compound_abbrev
            compound_found = False
            # Retry until the abbreviation resolves to a compound in the
            # reaction's lab group, or the abbreviation becomes empty.
            while correct_abbrev and not compound_found:
                try:
                    compound = Compound.objects.get(
                        abbrev=correct_abbrev, labGroup=reaction.labGroup)
                    compound_found = True
                    r['compound.old_abbrev'] = r['compound.abbrev']
                    r['compound.abbrev'] = correct_abbrev
                except Compound.DoesNotExist:
                    if no_compound_prompts:
                        correct_abbrev = ''
                    else:
                        self.stderr.write(
                            'Could not find compound with abbreviation {}. Checking chemspider...'.format(correct_abbrev))
                        results = cs.simple_search(correct_abbrev)
                        if len(results) != 1:
                            CSID = raw_input(
                                'Could not find unique compound with abbreviation {}. Do you know the CSID? '.format(correct_abbrev))
                            if CSID.isdigit():
                                results = cs.simple_search(CSID)
                        if len(results) == 1:
                            compound = None
                            try:
                                compound = Compound.objects.get(
                                    CSID=results[0].csid, labGroup=reaction.labGroup)
                            except Compound.DoesNotExist:
                                pass
                            user_response = None
                            while user_response is None:
                                if compound is None:
                                    user_verification = raw_input('Found unique compound with CSID {} and name {} for abbreviation {}. This is NOT IN THE COMPOUND GUIDE. Is this correct? (y/n): '.format(
                                        results[0].csid, results[0].common_name, correct_abbrev))
                                else:
                                    user_verification = raw_input('Found unique compound with CSID {}, name {}, and abbreviation {} for abbreviation {} in the compound guide. Is this correct? (y/n): '.format(
                                        compound.CSID, compound.name, compound.abbrev, correct_abbrev))
                                if user_verification and user_verification.lower()[0] == 'y':
                                    user_response = True
                                elif user_verification and user_verification.lower()[0] == 'n':
                                    user_response = False
                            if user_response:
                                if compound is not None:
                                    # Remember the correction and retry the
                                    # lookup with the guide's abbreviation.
                                    correct_abbrev = compound.abbrev
                                    reagent_dict[
                                        compound_abbrev] = correct_abbrev
                                    continue
                                else:
                                    self.stderr.write('Creating compound with CSID {}, abbrevation {}, name {}'.format(
                                        results[0].csid, correct_abbrev, results[0].common_name))
                                    c = Compound(
                                        CSID=results[0].csid, labGroup=reaction.labGroup, abbrev=correct_abbrev)
                                    try:
                                        c.csConsistencyCheck()
                                        c.save(invalidateReactions=False)
                                        continue
                                    except ValidationError:
                                        c.delete()
                                        raise
                        self.stderr.write(
                            'Could not get unambiguous chemspider entry for abbreviation {}'.format(correct_abbrev))
                        self.stderr.write('Unknown Reactant {} with amount {} {} in reaction {}'.format(
                            r['compound.abbrev'], r['amount'], r['unit'], r['reaction.reference']))
                        correct_abbrev = raw_input(
                            'What is the correct abbreviation for this? ')
                        reagent_dict[compound_abbrev] = correct_abbrev

            if compound_found:
                self.stdout.write('{}: Creating quantity for compound {} and reaction {}'.format(
                    i, compound.abbrev, reaction.reference))
                if r['compound.abbrev'] in ('water', 'H2O'):
                    r['density'] = 1
                try:
                    mw = NumMolDescriptorValue.objects.get(
                        compound=compound, descriptor__heading='mw').value
                except NumMolDescriptorValue.DoesNotExist:
                    # Saving the compound is expected to create the 'mw'
                    # value that is read back on the next line.
                    compound.save(invalidateReactions=False)
                    mw = NumMolDescriptorValue.objects.get(
                        compound=compound, descriptor__heading='mw').value
                if r['compound.old_abbrev'] is not None and r['compound.old_abbrev'] != r['compound.abbrev']:
                    reaction.notes += ' Compound abbreviation {} changed to {}'.format(
                        r['compound.old_abbrev'], r['compound.abbrev'])
                    reaction.save(calcDescriptors=False)
                if r['compoundrole.name'] == 'pH':
                    # pH adjusters are only recorded in the notes; no
                    # quantity is created for them.
                    reaction.notes += ' pH adjusting reagent used: {}, {}{}'.format(
                        compound, r['amount'], r['unit'])
                    reaction.save(calcDescriptors=False)
                else:
                    compoundrole = None
                    # NOTE(review): this loop only runs while the row has no
                    # role name. When a role name is supplied the loop is
                    # skipped, so `compoundrole` stays None and `role_label`
                    # keeps whatever value an earlier row left behind.
                    # Block nesting was reconstructed from whitespace-mangled
                    # text -- confirm against the original file.
                    while compoundrole is None and r['compoundrole.name'] in (None, '', '?'):
                        if r['compoundrole.name'] in (None, '', '?'):
                            classes = compound.chemicalClasses.all()
                            if classes.count() > 1:
                                self.stderr.write(
                                    '{} has more than one chemical class: {}'.format(compound, classes))
                                role_label = raw_input('Which is the correct role for reagent {} in reaction {} with amount {} {}'.format(
                                    compound, reaction, r['amount'], r['unit']))
                            elif classes.count() == 0:
                                self.stderr.write(
                                    '{} has no chemical classes'.format(compound))
                                role_label = raw_input(
                                    'What chemical class does {} belong to? '.format(compound))
                                cc = ChemicalClass.objects.get(
                                    label=role_label)
                                compound.chemicalClasses.add(cc)
                                # Sanity check
                                assert(
                                    compound.chemicalClasses.all().count() == 1)
                            else:  # count == 1
                                role_label = classes[0].label
                                # Arguments now follow the placeholder
                                # order (reagent, amount, unit, reaction);
                                # they were previously passed as (reagent,
                                # reaction, amount, unit).
                                self.stderr.write('No reaction role listed for reagent {} with amount {} {} in reaction {}. '
                                                  'Using chemical class {}'.format(compound, r['amount'], r['unit'], reaction, role_label))
                            r['compoundrole.name'] = role_label
                        else:
                            role_label = r['compoundrole.name']
                        if not role_label:
                            reaction.notes += ' No role for reactant {} with amount {} {}'.format(
                                r['compound.abbrev'], r['amount'], r['unit'])
                            reaction.save(calcDescriptors=False)
                        else:
                            try:
                                compoundrole = CompoundRole.objects.get(
                                    label=role_label)
                            except CompoundRole.DoesNotExist:
                                user_response = None
                                # Only written back to the row when a
                                # replacement label was actually chosen;
                                # avoids reusing a stale or unbound name.
                                new_role_label = None
                                if role_label in role_dict:
                                    new_role_label = role_dict[role_label]
                                else:
                                    while user_response is None:
                                        # The prompt was missing its
                                        # .format() call and printed a
                                        # literal '{}'.
                                        user_verification = raw_input(
                                            'Compound role {} does not exist. Would you like to add it? '.format(role_label))
                                        if user_verification and user_verification.lower()[0] == 'y':
                                            user_response = True
                                        elif user_verification and user_verification.lower()[0] == 'n':
                                            user_response = False
                                    if user_response:
                                        compoundrole = CompoundRole.objects.create(
                                            label=role_label)
                                    else:
                                        new_role_label = raw_input(
                                            'What should this label be? ')
                                        role_dict[
                                            role_label] = new_role_label
                                if new_role_label is not None:
                                    r['compoundrole.name'] = new_role_label
                    self.stdout.write('\tadding {} with role {} to {}'.format(
                        compound.abbrev, role_label, reaction.reference))
                    if r['amount'] in ('', '?'):
                        amount = None
                        reaction.notes += ' No amount for reactant {} with role {}'.format(
                            r['compound.abbrev'], r['compoundrole.name'])
                        reaction.save(calcDescriptors=False)
                    elif r['unit'] == 'g':
                        amount = float(r['amount']) / mw
                    elif r['unit'] == 'd' or r['unit'] == 'mL':
                        valid_density = False
                        # Keep asking until the density parses as a float;
                        # entered densities are remembered per abbreviation.
                        while not valid_density:
                            if compound.abbrev in density_dict:
                                r['density'] = density_dict[
                                    compound.abbrev]
                            try:
                                density = float(r['density'])
                                valid_density = True
                            except (TypeError, ValueError):
                                self.stderr.write("Density '{}' cannot be converted to float. (Compound {} with amount {} {} in reaction {})".format(
                                    r['density'], compound, r['amount'], r['unit'], reaction))
                                r['density'] = raw_input(
                                    'What is the density? ')
                                density_dict[compound.abbrev] = r[
                                    'density']
                        if r['unit'] == 'd':
                            # presumably 'd' is drops at 0.0375 mL per drop
                            # -- TODO confirm
                            amount = float(
                                r['amount']) * 0.0375 * density / mw
                        elif r['unit'] == 'mL':
                            amount = float(
                                r['amount']) * density / mw
                    else:
                        raise RuntimeError('invalid unit entered')
                    # convert to millimoles
                    if amount is not None:
                        amount = (amount * 1000)
                    # Replace any quantity already stored for this pair.
                    cqq = CompoundQuantity.objects.filter(
                        compound=compound, reaction=reaction)
                    if cqq.exists():
                        cqq.delete()
                    quantity = CompoundQuantity(
                        compound=compound, reaction=reaction, role=compoundrole, amount=amount)
                    quantities.append(quantity)
                    if len(quantities) > save_at_once:
                        self.stdout.write('Saving...')
                        CompoundQuantity.objects.bulk_create(
                            quantities)
                        quantities = []
            else:
                self.stderr.write('Unknown Reactant {} with amount {} {} in reaction {}'.format(
                    r['compound.abbrev'], r['amount'], r['unit'], r['reaction.reference']))
                reaction.notes += ' Unknown Reactant {} with amount {} {}'.format(
                    r['compound.abbrev'], r['amount'], r['unit'])
                reaction.valid = False
                reaction.save(calcDescriptors=False)
            writer.writerow(r)
        self.stdout.write('Saving...')
        CompoundQuantity.objects.bulk_create(quantities)
        quantities = []


def _sync_descriptor_value(self, model, descriptor, reaction, value, pending):
    """Update the stored descriptor value, or queue a new one in ``pending``.

    If ``model`` already holds a value for ``descriptor`` and ``reaction``
    it is saved again only when it differs from ``value``. Otherwise a new
    value is built with ``descriptor.createValue`` and appended to
    ``pending`` for a later ``bulk_create``.
    """
    try:
        stored = model.objects.get(descriptor=descriptor, reaction=reaction)
        if stored.value != value:
            stored.value = value
            stored.save()
    except model.DoesNotExist:
        pending.append(descriptor.createValue(reaction, value))


def _parse_yes_no(self, raw, column):
    """Map a yes/no cell to True/False, and None/''/'?' to None.

    Matching is case-insensitive. Raises RuntimeError, naming ``column``,
    for any other content.
    """
    if raw in (None, '', '?'):
        return None
    lowered = raw.lower()
    if lowered == 'yes':
        return True
    if lowered == 'no':
        return False
    raise RuntimeError(
        "Unrecognized string '{}' in {} column".format(raw, column))
description="Script to obtain SMILES for a solutes in a list") argparser.add_argument('-db', '--db', help="the molecule database") argparser.add_argument('-solvent', '--solvent', help="the solvent", default="water") argparser.add_argument('-solutes', '--solutes', help="the list of solutes") args = argparser.parse_args() db = dblib.SolvDb(filename=args.db, type="abs", filehandle="^0") solutes = [s.strip() for s in open(args.solutes, 'r').readlines()] if os.getenv("SPIDERKEY") is None: print "SPIDERKEY environmental variable not set! Exit." quit() cs = ChemSpider(os.getenv("SPIDERKEY")) # Loop over all the database entries in the solute lists n = 0 for entry in db.itersolutelist(args.solvent, solutes): if os.path.exists(entry.FileHandle + ".smi"): continue hits = cs.search(entry.SoluteName) if len(hits) > 0: smi = hits[0].smiles with open(entry.FileHandle + ".smi", "w") as f: f.write("%s\n" % smi) else: print entry.SoluteName, entry.FileHandle n += 1 print "Looped over %d solutes" % n
def handle(self, *args, **kwargs):
    """Import legacy reaction data from the TSV exports in ``kwargs['directory']``.

    The directory is read in four passes:

    1. ``performedReactionsNoDupsLower.tsv``: for every reaction that already
       exists in the database, links its duplicate (when one is named) and
       creates or updates the outcome, boolean outcome, purity, temperature,
       time, pH, pre-heat-standing and teflon-pouch descriptor values.
    2. ``compound_labs.tsv``: creates compounds whose abbreviation is not in
       the database yet, resolving a ChemSpider ID from the CSID, CAS_ID or
       name column unless the row is flagged ``custom``.
    3. ``compound_chemicalClasses.tsv``: attaches chemical classes to
       compounds.
    4. ``compoundquantities.tsv``: stores the amount of each reactant.

    Raises:
        RuntimeError: if a reference matches more than one reaction after
            filtering on validity, if no unambiguous ChemSpider entry can be
            found for a compound, or if a quantity row has an unknown unit.
        ValidationError: re-raised when a ChemSpider-backed compound fails
            its consistency check or save.
        PerformedReaction.DoesNotExist: if a quantity row names a reaction
            that is not in the database.
    """
    folder = kwargs['directory']

    # NOTE: two earlier passes used to live here as commented-out code: one
    # wrote performedReactionsNoDupsLower.tsv from performedReactions.tsv
    # (lower-casing and disambiguating references) and one created the
    # PerformedReaction rows. Both are expected to have been run already.

    batch_size = 500
    outValues = []
    outBoolValues = []
    purityValues = []
    temperatureValues = []
    timeValues = []
    pHValues = []
    preHeatStandingValues = []
    teflonValues = []
    # (model, pending unsaved values), in the order they are bulk-created.
    batches = (
        (OrdRxnDescriptorValue, outValues),
        (BoolRxnDescriptorValue, outBoolValues),
        (OrdRxnDescriptorValue, purityValues),
        (NumRxnDescriptorValue, temperatureValues),
        (NumRxnDescriptorValue, timeValues),
        (NumRxnDescriptorValue, pHValues),
        (NumRxnDescriptorValue, preHeatStandingValues),
        (BoolRxnDescriptorValue, teflonValues),
    )

    def sync_value(model, descriptor, reaction, value, pending):
        """Update the stored value if it differs, else queue a new one."""
        try:
            stored = model.objects.get(descriptor=descriptor, reaction=reaction)
        except model.DoesNotExist:
            pending.append(descriptor.createValue(reaction, value))
        else:
            if stored.value != value:
                stored.value = value
                stored.save()

    def flush():
        """Bulk-create every pending descriptor value and empty the queues."""
        self.stdout.write("Saving...")
        for model, pending in batches:
            model.objects.bulk_create(pending)
            del pending[:]
        self.stdout.write("...saved")

    with open(path.join(folder, 'performedReactionsNoDupsLower.tsv')) as reactions:
        reader = csv.DictReader(reactions, delimiter='\t')
        for r in reader:
            reference = r['reference'].lower()
            self.stdout.write('Reiterating for reaction with reference {}'.format(reference))
            ps = PerformedReaction.objects.filter(reference=reference)
            if ps.count() > 1:
                ps = ps.filter(valid=True)
            if not ps.exists():
                continue
            if ps.count() > 1:
                raise RuntimeError('{} has more than one reaction'.format(reference))
            p = ps[0]
            try:
                p.duplicateOf = PerformedReaction.objects.get(reference=r['duplicateOf.reference'].lower())
                p.save()
            except PerformedReaction.DoesNotExist:
                # No such reaction to link to; leave duplicateOf untouched.
                pass

            outcomeValue = int(r['outcome']) if r['outcome'] in ('1', '2', '3', '4') else None
            sync_value(OrdRxnDescriptorValue, outcomeDescriptor, p, outcomeValue, outValues)

            # An unknown outcome counts as False; the explicit None test
            # avoids comparing None with an int.
            value = outcomeValue is not None and outcomeValue > 2
            sync_value(BoolRxnDescriptorValue, outcomeBooleanDescriptor, p, value, outBoolValues)

            value = int(r['purity']) if r['purity'] in ('1', '2') else None
            sync_value(OrdRxnDescriptorValue, purityDescriptor, p, value, purityValues)

            # presumably Celsius -> Kelvin; confirm against the source data
            value = (float(r['temp']) + 273.15) if r['temp'] not in ('', '?') else None
            sync_value(NumRxnDescriptorValue, temperatureDescriptor, p, value, temperatureValues)

            value = float(r['time']) * 60 if r['time'] not in ('', '?') else None
            sync_value(NumRxnDescriptorValue, timeDescriptor, p, value, timeValues)

            value = float(r['pH']) if r['pH'] not in ('', '?') else None
            sync_value(NumRxnDescriptorValue, pHDescriptor, p, value, pHValues)

            # NOTE(review): bool() of any non-empty string is True (even
            # '0'), and the result is stored through the numeric value
            # model. Kept as in the original; confirm this is intended.
            value = bool(r['pre_heat standing']) if r.get('pre_heat standing') not in ('', None) else None
            sync_value(NumRxnDescriptorValue, preHeatStandingDescriptor, p, value, preHeatStandingValues)

            value = bool(int(r['teflon_pouch'])) if r.get('teflon_pouch') not in (None, '') else None
            sync_value(BoolRxnDescriptorValue, teflonDescriptor, p, value, teflonValues)

            if any(len(pending) > batch_size for _, pending in batches):
                flush()

        # Save whatever is left in the queues; without this the last
        # (partial) batch of new values would never reach the database.
        if any(pending for _, pending in batches):
            flush()

    with open(path.join(folder, 'compound_labs.tsv')) as compounds:
        reader = csv.DictReader(compounds, delimiter='\t')
        spider = ChemSpider(settings.CHEMSPIDER_TOKEN)
        for r in reader:
            l = LabGroup.objects.get(title=r['labGroup.title'])
            if Compound.objects.filter(abbrev=r['abbrev']).exists():
                continue
            self.stdout.write('Importing compound with abbreviation {} and name {}'.format(r['abbrev'], r['name']))
            if r.get('custom') != '1':
                if r.get('CSID') not in ('', None):
                    csid = r['CSID']
                else:
                    # Fall back to a CAS ID search, then to a name search;
                    # either must return exactly one hit to be trusted.
                    if r.get('CAS_ID') not in (None, ''):
                        CASResults = spider.simple_search(r['CAS_ID'])
                    else:
                        CASResults = []
                    if len(CASResults) == 1:
                        csid = CASResults[0].csid
                    else:
                        nameResults = spider.simple_search(r.get('name'))
                        if len(nameResults) != 1:
                            raise RuntimeError('Could not get unambiguous chemspider entry for CAS ID {} with name {}. Got {} responses'.format(r.get('CAS_ID'), r['name'], len(CASResults)))
                        csid = nameResults[0].csid
                c = Compound(CSID=csid, labGroup=l, abbrev=r['abbrev'])
                try:
                    c.csConsistencyCheck()
                    c.save()
                except ValidationError:
                    # Only a row with a primary key can be deleted; calling
                    # delete() on an unsaved instance would raise and hide
                    # the ValidationError we actually want to report.
                    if c.pk is not None:
                        c.delete()
                    raise
            else:
                if r.get('INCHI') is None:
                    r['INCHI'] = ''
                if r.get('smiles') is None:
                    r['smiles'] = ''
                c = Compound.objects.get_or_create(labGroup=l, custom=True, name=r['name'], abbrev=r['abbrev'], formula=r['formula'], smiles=r['smiles'], INCHI=r['INCHI'])[0]
                self.stdout.write(c.name.encode('utf-8'))
                c.save()

    with open(path.join(folder, 'compound_chemicalClasses.tsv')) as chemicalClasses:
        reader = csv.DictReader(chemicalClasses, delimiter='\t')
        for r in reader:
            self.stdout.write('working with class {}'.format(r['chemicalClass.label']))
            matching = Compound.objects.filter(abbrev=r['compound.abbrev'])
            if matching.count() > 0:
                c1 = ChemicalClass.objects.get_or_create(label=r['chemicalClass.label'])[0]
                for c2 in matching:
                    if c1 not in c2.chemicalClasses.all():
                        c2.chemicalClasses.add(c1)
                        c2.save()

    with open(path.join(folder, 'compoundquantities.tsv')) as cqs:
        reader = csv.DictReader(cqs, delimiter='\t')
        for r in reader:
            try:
                reaction = PerformedReaction.objects.get(reference=r['reaction.reference'].lower())
                compound = Compound.objects.get(abbrev=r['compound.abbrev'], labGroup=reaction.labGroup)
                if r['compound.abbrev'] in ('water', 'H2O'):
                    r['density'] = 1
                mw = NumMolDescriptorValue.objects.get(compound=compound, descriptor__heading='mw').value
                if r['compoundrole.name'] != 'pH':
                    self.stdout.write('adding {} to {}'.format(compound.abbrev, reaction.reference))
                    compoundrole = CompoundRole.objects.get_or_create(label=r['compoundrole.name'])[0]
                    if r['amount'] in ('', '?'):
                        amount = None
                    elif r['unit'] == 'g':
                        amount = float(r['amount']) / mw
                    elif r['unit'] == 'd':
                        # presumably 'd' is drops at 0.0375 mL each; confirm
                        amount = float(r['amount']) * 0.0375 * float(r['density']) / mw
                    elif r['unit'] == 'mL':
                        amount = float(r['amount']) * float(r['density']) / mw
                    else:
                        raise RuntimeError('invalid unit entered')
                    # convert to millimoles
                    if amount is not None:
                        amount = amount * 1000
                    cqq = CompoundQuantity.objects.filter(role=compoundrole, compound=compound, reaction=reaction)
                    existing = cqq.count()
                    if existing > 1:
                        # NOTE(review): duplicates are removed but no
                        # replacement row is created; kept as in the
                        # original.
                        cqq.delete()
                    elif existing == 0:
                        quantity = CompoundQuantity(role=compoundrole, compound=compound, reaction=reaction)
                        quantity.amount = amount
                        quantity.save(recalculate=False)
                else:
                    reaction.notes += ' pH adjusting reagent used: {}, {}{}'.format(r['compound.abbrev'], r['amount'], r['unit'])
                    reaction.save(calcDescriptors=False)
            except Compound.DoesNotExist:
                # The reaction lookup succeeded (it runs first), so
                # `reaction` is bound here: flag it as invalid.
                self.stderr.write('Unknown Reactant {} with amount {} {} in reaction {}'.format(r['compound.abbrev'], r['amount'], r['unit'], r['reaction.reference']))
                reaction.notes += ' Unknown Reactant {} with amount {} {}'.format(r['compound.abbrev'], r['amount'], r['unit'])
                reaction.valid = False
                reaction.save(calcDescriptors=False)
            except PerformedReaction.DoesNotExist:
                # A quantity for an unknown reaction is fatal.
                raise
class make_descs:
    """Look up SMILES strings for a list of CAS numbers and run Dragon.

    The CAS numbers are read from the CSV file named by ``argv[1]`` and
    resolved through ChemSpider.
    """

    def __init__(self, argv):
        """Store the CAS file path (``argv[1]``) and create the ChemSpider client."""
        self.CAS_file = argv[1]
        # NOTE(review): the ChemSpider token is hard-coded in the source;
        # consider loading it from the environment instead.
        self.cs = ChemSpider('d1778a9f-c41f-41f6-920e-fc6d9ff739ca')

    def querySMILEs(self):
        """Query ChemSpider for the SMILES of every CAS entry in the input file.

        Only the first column of each row is used, and anything after a
        ``//`` in it is ignored. Entries with no ChemSpider hit are left out
        of ``self.CAS`` / ``self.SMILEs`` and their row index is recorded in
        ``self.missing``. Blank rows are skipped.

        The SMILES are then cleaned, written to ``./cas/SMILEs.csv`` and
        returned.
        """
        self.CAS = []
        self.SMILEs = []
        self.missing = []
        # 'rb' is the csv-module convention of Python 2, which this file
        # targets.
        with open(self.CAS_file, 'rb') as csvfile:
            csv_read = csv.reader(csvfile, delimiter=',')
            for index_num, fields in enumerate(csv_read):
                if not fields:
                    # A blank line has no first column to read.
                    continue
                cas = fields[0].split('//')[0]
                chem_this = self.cs.search(cas)
                print("Working on chemical  %s %s" % (index_num, cas))
                try:
                    found = chem_this[0].smiles
                except IndexError:
                    print("Can't find index:  %s %s" % (index_num, cas))
                    self.missing.append(index_num)
                    continue
                self.CAS.append(cas)
                self.SMILEs.append(found)
        # Clean up SMILEs
        self._cleanSMILEs()
        # Write SMILEs
        self._writeSEMILs()
        return self.SMILEs

    def _writeSEMILs(self):
        """Write one SMILES string per row to ``./cas/SMILEs.csv``."""
        # The context manager guarantees the file is flushed and closed.
        with open('./cas/SMILEs.csv', 'wb') as resultsfile:
            wr = csv.writer(resultsfile, dialect='excel')
            for eachSMILEs in self.SMILEs:
                wr.writerow([eachSMILEs])

    def _cleanSMILEs(self):
        """Strip every '+', '-' and '.' from the stored SMILES, in place."""
        assert self.SMILEs is not None
        for indx, eachSMILEs in enumerate(self.SMILEs):
            for unwanted in ("+", "-", "."):
                eachSMILEs = eachSMILEs.replace(unwanted, "")
            self.SMILEs[indx] = eachSMILEs
        print("SMILEs have been cleaned up!")

    def calculateDescs(self):
        """Start the Dragon command-line shell with the predefined script.

        The process is launched with ``subprocess.Popen`` and not waited
        for. According to the original note, the descriptors are the ones
        used by the LCIA module.
        """
        dragonShellCall = "./Dragon/dragon6shell.exe"
        dragonScriptCall = "./Dragon/test_script.drs"
        sb.Popen([dragonShellCall, "-s", dragonScriptCall])
def __init__(self):
    """Build the ChemSpider client from the key and API URL held in the settings constants."""
    constants = SettingsConstants()
    self.key, self.url = (
        constants.get('CHEMSPI_KEY'),
        constants.get('CHEMSPI_API_URL'),
    )
    self.cs = ChemSpider(self.key, api_url=self.url)
import CoolProp
from chemspipy import ChemSpider
from chemspipy_key import key  # private file with the key (DO NOT COMMIT!!)
import glob, json

# Single ChemSpider client, authenticated with the private key.
cs = ChemSpider(key)

# Map from name to Chemspider ID
backup_map = {
    'Propyne': 6095,
    'R236EA': 71342,
    'R245ca': 62827,
    'trans-2-Butene': 56442,
    'Oxygen': 952,
    'Fluorine': 22932,
    'Hydrogen': 762,
    'Deuterium': 22931,
    'HFE143m': 66577,
    'SulfurHexafluoride': 16425,
    'R114': 13853215
}

# Make sure the key works: fetch a known compound (ID 2157) and compare its
# InChIKey with the expected value. The script stops here on a mismatch.
c = cs.get_compound(2157)
assert(c.inchikey == 'BSYNRYMUTXBXSQ-UHFFFAOYAW')

# Walk every fluid description file and read the fluid's name from it.
for fname in glob.glob('../fluids/*.json'):
    with open(fname,'r') as fp:
        jj = json.load(fp)
    fluid = jj['INFO']['NAME']
# Tool uses the ChemSpiPy library to assist in accessing the ChemSpider Database
# Syntax to run command: python ChemSpider.py -(f/n) term
#   -f name    -> get the formula for the common name
#   -n formula -> get the common name for the formula
import sys  # allows us to use command line arguments

# Reject bad invocations before any network call is made, and exit with a
# non-zero status so calling scripts can detect the failure. An unknown flag
# would otherwise run a search and silently print nothing.
if len(sys.argv) < 3 or sys.argv[1] not in ("-f", "-n"):
    print("Incorrect input.\n\t==> python ChemSpider.py [-f/-n] <argument>")
    sys.exit(1)

from chemspipy import ChemSpider

# NOTE(review): the access token is hard-coded in the source; consider
# reading it from the environment instead.
cs = ChemSpider('3e05e0a6-9f49-4dff-ba0e-a9d6ca3d04ea')  # passes our access token to the ChemSpider api

for result in cs.search(sys.argv[2])[:5]:
    # -f: give the name and formula of the first five results.
    if sys.argv[1] == "-f":
        print(result.common_name)
        print(result.molecular_formula)
    # -n: only the common name of the first result is wanted.
    if sys.argv[1] == "-n":
        print(result.common_name)
        break
if not args.from_db: from chemspipy import ChemSpider if not args.export_db_only: import pandas as pd if args.from_db or args.export_db_only or args.export_db_csv: import shelve ## ==================== set up chemspider ==================== if not args.from_db: possiblefile = os.path.expanduser(args.token) if os.path.exists(possiblefile): # is file with open(possiblefile) as f: csp = ChemSpider(f.read().strip()) else: csp = ChemSpider(args.token) # else is token else: csp = None spq = spiderquery(csp, args.prefix + '_p') ## ==================== list of compounds ==================== if args.inputfile: with open(args.inputfile) as csvfile: f = csv.reader(csvfile) compounds = [] j = 0 for i, row in enumerate(f):
import os
from chemspipy import ChemSpider
from sgenlib import smiles

if __name__ == "__main__":
    # Report every name in the given list file that ChemSpider cannot find.
    parser = argparse.ArgumentParser(description="Program to find molecules in ChemSpdie")
    parser.add_argument('filename', help="the filename of a list")
    args = parser.parse_args()

    spider_key = os.getenv("SPIDERKEY")
    if spider_key is None:
        print("SPIDERKEY environmental variable not set! Exit.")
        quit()
    cs = ChemSpider(spider_key)

    # One molecule name per line, surrounding whitespace removed.
    with open(args.filename, "r") as f:
        molecules = [line.strip() for line in f]

    for mol in molecules:
        hits = cs.search(mol)
        # Names with hits are not reported; the code that listed their
        # common names was already disabled in the original.
        if len(hits) == 0:
            print(mol + "\t!!")
if __name__ == '__main__':
    # Fetch a SMILES string from ChemSpider for every listed solute of the
    # chosen solvent and store it in "<FileHandle>.smi".
    argparser = argparse.ArgumentParser(description="Script to obtain SMILES for a solutes in a list")
    argparser.add_argument('-db', '--db', help="the molecule database")
    argparser.add_argument('-solvent', '--solvent', help="the solvent", default="water")
    argparser.add_argument('-solutes', '--solutes', help="the list of solutes")
    args = argparser.parse_args()

    db = dblib.SolvDb(filename=args.db, type="abs", filehandle="^0")
    # Read the solute names through a context manager so the file is closed.
    with open(args.solutes, 'r') as solutes_file:
        solutes = [s.strip() for s in solutes_file]

    if os.getenv("SPIDERKEY") is None:
        print("SPIDERKEY environmental variable not set! Exit.")
        quit()
    cs = ChemSpider(os.getenv("SPIDERKEY"))

    # Loop over all the database entries in the solute lists
    n = 0
    for entry in db.itersolutelist(args.solvent, solutes):
        # Entries that already have a .smi file are left alone.
        if os.path.exists(entry.FileHandle + ".smi"):
            continue
        hits = cs.search(entry.SoluteName)
        if len(hits) > 0:
            smi = hits[0].smiles
            with open(entry.FileHandle + ".smi", "w") as f:
                f.write("%s\n" % smi)
        else:
            # No hit: report the solute so it can be handled by hand.
            print("%s %s" % (entry.SoluteName, entry.FileHandle))
        n += 1
    print("Looped over %d solutes" % n)
from chemspipy import ChemSpider
from sgenlib import smiles

if __name__ == "__main__":
    # Look up the SMILES of every distinct molecule name in the list file.
    parser = argparse.ArgumentParser(
        description="Program to build molecules", )
    parser.add_argument('filename', help="the filename of a list")
    args = parser.parse_args()

    if os.getenv("SPIDERKEY") is None:
        print("SPIDERKEY environmental variable not set! Exit.")
        quit()
    cs = ChemSpider(os.getenv("SPIDERKEY"))

    lines = []
    with open(args.filename, "r") as f:
        lines = [line.strip() for line in f.readlines()]

    # Drop duplicates while keeping first-occurrence order. This yields the
    # same list as sorting the set by lines.index(), without the quadratic
    # cost and without the Python-2-only `cmp=` argument.
    molecules = []
    seen = set()
    for line in lines:
        if line not in seen:
            seen.add(line)
            molecules.append(line)

    for mol in molecules:
        hits = cs.search(mol)
        if len(hits) == 0:
            print(mol + "\t!!")
        else:
            molsmiles = hits[0].smiles
            mol2 = mol.strip()
            # str.replace returns a new string; the result must be kept.
            mol2 = mol2.replace(" ", "_")
from nose.tools import eq_, ok_, raises
import requests
import six

from chemspipy import ChemSpider, MOL2D, MOL3D, BOTH
from chemspipy.errors import ChemSpiPyAuthError, ChemSpiPyServerError

# Keep the root logger quiet but show chemspipy's own debug output.
logging.basicConfig(level=logging.WARN)
logging.getLogger('chemspipy').setLevel(logging.DEBUG)

# Security token is retrieved from environment variables
# (importing this module raises KeyError when the variable is not set).
CHEMSPIDER_SECURITY_TOKEN = os.environ['CHEMSPIDER_SECURITY_TOKEN']

# Chemspider instances with and without a security token
cs = ChemSpider(security_token=CHEMSPIDER_SECURITY_TOKEN)
cs2 = ChemSpider()


def test_no_security_token():
    """Test ChemSpider can be initialized with no parameters."""
    # Without an explicit token the attribute is expected to be None.
    eq_(cs2.security_token, None)


def test_security_token():
    """Test security token is set correctly when initializing ChemSpider"""
    eq_(cs.security_token, CHEMSPIDER_SECURITY_TOKEN)


def test_chemspider_repr():
    """Test ChemSpider object repr."""
return random.choice(security_token) else: print("You need Security_token.txt providing security token. Please contact me as soon as.") # print(tokenchoice()) if os.path.isfile('chemspiderdb.json'): spiderjsonfileid = [] with open('chemspiderdb.json', 'r') as jsonfile: for f in jsonfile.readlines(): the_dict = json.loads(f) spiderjsonfileid.append(the_dict['_id']) # print(spiderjsonfileid) for csid in csids: # cskey = random.choice(cs_security_key) cs = ChemSpider(tokenchoice()) if csid in spiderjsonfileid: print('{0} has been in the file'.format(str(csid))) continue compound = cs.get_compound(csid) try: doc = {'_id': int(compound.csid), 'common_name': compound.common_name} sleep(random.uniform(0.2, 0.5)) doc['molecular_weight'] = compound.molecular_weight sleep(random.uniform(0, 0.5)) doc['molecular_formula'] = compound.molecular_formula doc['stdinchi'] = compound.stdinchi sleep(random.uniform(0.1, 0.5)) doc['stdinchikey'] = compound.stdinchikey doc['smiles'] = compound.smiles # sleep(random.uniform(1, 1.1))
from django.utils.text import slugify
from chemspipy import ChemSpider
from professor_oak.models import ScoreMixin

log = logging.getLogger(__name__)

# Load the chemspider API for accessing the RSC structure database.
# When no key is configured the API object is None.
try:
    cs_key = settings.CHEMSPIDER_KEY
except AttributeError:
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    log.warning('CHEMSPIDER_KEY not found in localsettings.py')
    chemspider_api = None
else:
    chemspider_api = ChemSpider(cs_key)


class Hazard(models.Model):
    """A hazard type as defined by the global harmonized system.

    Attributes
    ----------
    - pictogram : Image file that represents this image. If not
      provided, we will look in `static_files/ghs_pictograms/` for one
      that matches the `name` attribute
    """
    # One- and two-letter codes for the hazard categories.
    PHYSICAL = 'p'
    HEALTH = 'h'
    PHYSICAL_AND_HEALTH = 'ph'
    ENVIRONMENTAL = 'e'
from chemspipy import ChemSpider

cs = ChemSpider("CHEMSPIDER_API_KEY")


def remove_prefix(disctext):
    """Return the message content without the leading "!chem " command prefix.

    Returns None when the content does not start with the prefix.
    """
    cprefix = "!chem "
    if not disctext.content.startswith(cprefix):
        return None
    return disctext.content[len(cprefix):]


def getCompound(id):
    """Return the top ChemSpider search result for a compound name or id."""
    # The search returns a list of compounds; the first one is the best match.
    return cs.search(id)[0]
import os import pickle import numpy as np import sys os.environ.setdefault("DJANGO_SETTINGS_MODULE", "ms2ldaviz.settings_simon") import django import jsonpickle django.setup() from chemspipy import ChemSpider from basicviz.models import * if __name__ == '__main__': cs = ChemSpider('b07b7eb2-0ba7-40db-abc3-2a77a7544a3d') exp_name = sys.argv[1] e = Experiment.objects.get(name = exp_name) print e docs = Document.objects.filter(experiment = e) for doc in docs: md = jsonpickle.decode(doc.metadata) ik = md.get('InChIKey',md.get('inchikey',None)) print ik # search in chemspi results = cs.search(ik) if len(results) > 0: m = results[0].mol_2d if len(m) > 0: doc.mol_string = m