def _init(this, source, target, encoding=DEFAULT_ENCODING):
    """Initialise the instance and load the SDF records found in *source*.

    The parent class' ``_init`` is invoked with *target* first.  *source*
    must satisfy ``tc.isstring`` and is used as the path of the file to
    open.  The file content is decoded with *encoding*, handed to
    ``s2p.parse_sdf`` and the result is stored on ``this._sdfrecords``.
    """
    super(populate_from_sdf, this)._init(target)
    assert tc.isstring(source)

    # Read everything while the file is open; decoding happens afterwards
    # so the handle is held no longer than necessary.
    with open(source) as sdf_file:
        raw_content = sdf_file.read()

    this._sdfrecords = s2p.parse_sdf(raw_content.decode(encoding))
def __init__(self, data=None, types=None,
             fielddelimiter=FIELDDELIMITER,
             recorddelimiter=RECORDDELIMITER,
             prefix=PREFIX, suffix=SUFFIX,
             prologue=PROLOGUE, epilogue=EPILOGUE):
    """Record the formatting options and build the rows from *data*.

    ``self._format`` receives the result of applying ``_subdict`` to the
    call arguments and ``self._FORMATTING_ATTRS``; the same mapping is
    merged into the instance ``__dict__``.

    *types*, when given, is turned into a tuple in which every entry
    that is ``None``, or whose zero-argument call yields something
    ``isstring`` accepts, is replaced by ``None``.  The rows returned by
    ``self._makerows(data, <normalised types>)`` are kept on
    ``self._rows`` and the length of the longest row (0 when there are
    no rows) on ``self._width``.
    """
    # locals() has to be evaluated first, while only the call arguments
    # are bound in this frame.
    self._format = _subdict(locals(), self._FORMATTING_ATTRS)
    self.__dict__.update(self._format)

    if types is None:
        column_types = None
    else:
        column_types = tuple(
            [None if (kind is None or isstring(kind())) else kind
             for kind in types])

    table = self._makerows(data, column_types)
    self._rows = table

    height = len(table)
    if height > 0:
        self._width = max(len(row) for row in table)
    else:
        self._width = 0
def __init__(self, data=None, types=None,
             fielddelimiter=FIELDDELIMITER,
             recorddelimiter=RECORDDELIMITER,
             prefix=PREFIX, suffix=SUFFIX,
             prologue=PROLOGUE, epilogue=EPILOGUE):
    """Store the formatting settings and create the row table.

    The call arguments are filtered through ``_subdict`` together with
    ``self._FORMATTING_ATTRS``; the outcome is saved as ``self._format``
    and also copied into the instance ``__dict__``.

    When *types* is not ``None`` each of its entries is inspected: an
    entry that is ``None`` or that, called without arguments, produces a
    value accepted by ``isstring`` becomes ``None``; any other entry is
    kept.  ``self._makerows`` is then called with *data* and that tuple
    (or ``None``), its result is stored as ``self._rows`` and the
    maximum row length (0 for an empty table) as ``self._width``.
    """
    # Snapshot the arguments before any further local name is created.
    formatting = _subdict(locals(), self._FORMATTING_ATTRS)
    self._format = formatting
    self.__dict__.update(self._format)

    normalised = None
    if types is not None:
        kept = []
        for entry in types:
            if entry is None or isstring(entry()):
                kept.append(None)
            else:
                kept.append(entry)
        normalised = tuple(kept)

    rows = self._makerows(data, normalised)
    self._rows = rows

    widest = 0
    if len(rows) > 0:
        widest = max(len(row) for row in rows)
    self._width = widest
def __init__(self, data,
             fielddelimiter=FIELDDELIMITER,
             recorddelimiter=RECORDDELIMITER,
             prefix=PREFIX, suffix=SUFFIX,
             prologue=PROLOGUE, epilogue=EPILOGUE,
             keep_empty=False):
    """Open the workbook at path *data* and wrap its sheets.

    The call arguments are filtered through ``_subdict`` with
    ``Worksheet._FORMATTING_ATTRS`` and merged into the instance
    ``__dict__``.  *data* must satisfy ``isstring``; it is stored as
    ``self._path`` and passed to ``xl.open_workbook``.

    Unless *keep_empty* is true, sheets whose ``nrows`` is not greater
    than 0 are dropped.  Every remaining sheet is wrapped in a
    ``Worksheet`` (with ``parent=self``); the wrappers are stored as the
    tuple ``self._sheets`` and ``self._label_2_index`` maps each
    wrapper's ``name`` to its position in that tuple.
    """
    # Must come first: locals() should only contain the call arguments.
    self.__dict__.update(_subdict(locals(), Worksheet._FORMATTING_ATTRS))

    assert isstring(data)
    self._path = data

    selected = xl.open_workbook(data).sheets()
    if not keep_empty:
        selected = [sheet for sheet in selected if sheet.nrows > 0]

    wrapped = tuple([Worksheet(sheet, parent=self) for sheet in selected])
    self._sheets = wrapped

    # If two sheets share a name, the later position wins.
    positions = {}
    for position, label in enumerate([ws.name for ws in wrapped]):
        positions[label] = position
    self._label_2_index = positions
def main(path):
    """
    Read in the sdf file

    The file at *path* is read, decoded with DEFAULT_ENCODING and parsed
    with ``s2p.parse_sdf``.  For every parsed record the SDF field labels
    are mapped to model fields (applying the per-field converter, default
    and "required" rule), and a ``SmallMolecule`` is created from the
    result and saved.

    Raises:
        Exception: when a required field has no value after conversion
            and defaulting.  Any exception raised while converting a
            field or saving a record is logged and re-raised unchanged,
            with its original traceback.
    """
    # map field labels to model fields
    properties = ('model_field', 'required', 'default', 'converter')
    get_primary_name = lambda x: x.split(';')[0].strip()
    get_alternate_names = lambda x: ';'.join(
        [part.strip() for part in x.split(';')[1:]])

    labels = {
        s2p.MOLDATAKEY: ('molfile', True),
        # NOTE: even though these db field are not integers,
        # it is convenient to convert the read in values to INT to make
        # sure they are not interpreted as float values
        'facility_reagent_id': (
            'facility_id', True, None,
            lambda x: util.convertdata(x[x.index('HMSL')+4:], int)),
        'salt_id': ('salt_id', True, None,
                    lambda x: util.convertdata(x, int)),
        'lincs_id': ('lincs_id', False),
        'chemical_name': ('name', True),
        'alternative_names': 'alternative_names',
        'pubchem_cid': 'pubchem_cid',
        'chembl_id': 'chembl_id',
        'chebi_id': 'chebi_id',
        'inchi': '_inchi',
        'inchi_key': '_inchi_key',
        'smiles': ('_smiles', True),
        'molecular_mass': (
            '_molecular_mass', False, None,
            lambda x: round(util.convertdata(x, float), 2)),
        'molecular_formula': '_molecular_formula',
        'software': 'software',
        # 'concentration':'concentration',
        # 'well_type':('well_type',False,'experimental'),
        'is_restricted': ('is_restricted', False, False,
                          util.bool_converter)}
    # convert the labels to fleshed out dict's, with strategies for
    # optional, default and converter
    labels = util.fill_in_column_definitions(properties, labels)

    assert typecheck.isstring(path)
    with open(path) as fh:
        data = fh.read().decode(DEFAULT_ENCODING)

    records = s2p.parse_sdf(data)
    logger.info(str(('read rows: ', len(records))))

    count = 0
    for record in records:
        logger.debug(str(('record', record)))
        initializer = {}
        # `column` (not `properties`) so the tuple defined above is not
        # shadowed by the per-field definition dict.
        for key, column in labels.items():
            logger.debug(
                str(('look for key: ', key, ', properties: ', column)))
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            value = record.get(key)

            # Todo, refactor to a method
            try:
                logger.debug(str(('raw value', value)))
                if converter is not None:
                    value = converter(value)
                if value is None and default is not None:
                    value = default
                # 'n/a' is treated as "no value" (and is therefore
                # rejected for required fields below).
                if value == 'n/a':
                    value = None
                # `== True` is kept on purpose: the fill-in value that
                # util.fill_in_column_definitions uses for 'required' is
                # not visible here.
                if value is None and required == True:
                    raise Exception(str((
                        'Field is required: ', key, initializer,
                        'record:', count)))
                logger.debug(str((
                    'model_field: ', model_field, ', value: ', value)))
                initializer[model_field] = value
            except Exception as e:
                exc_type, _, exc_tb = sys.exc_info()
                fname = os.path.split(
                    exc_tb.tb_frame.f_code.co_filename)[1]
                logger.error(str((exc_type, fname, exc_tb.tb_lineno)))
                logger.error(str(('invalid input', e, 'count', count)))
                # bare raise keeps the original traceback
                raise

        # follows is a kludge, to split up the entered "chemical_name"
        # field, on ';' - TODO: just have two fields that get entered
        if initializer['name']:
            initializer['alternative_names'] = get_alternate_names(
                initializer['name'])
            initializer['name'] = get_primary_name(initializer['name'])

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(str(('initializer: ', initializer)))
        try:
            sm = SmallMolecule(**initializer)
            sm.save()
            logger.info(str(('sm created:', sm)))
            count += 1
        except Exception as e:
            logger.error(str((
                'save failed for: ', initializer, 'error', e,
                'count: ', count)))
            # bare raise keeps the original traceback
            raise
def main(path):
    """Load the small molecules described by the SDF file at *path*.

    The file is read, decoded with DEFAULT_ENCODING and parsed with
    ``s2p.parse_sdf``.  For every parsed record the SDF field labels are
    mapped to model fields (applying the per-field converter, default and
    "required" rule); a ``SmallMolecule`` is created from the result and
    saved, followed by a ``SmallMoleculeBatch`` with ``batch_id=0`` that
    references it.

    Raises:
        Exception: when a required field has no value after conversion
            and defaulting.  Any exception raised while converting a
            field or saving a record is logged (with traceback) and
            re-raised unchanged.
    """
    properties = ('model_field', 'required', 'default', 'converter')
    get_primary_name = lambda x: x.split(';')[0].strip()
    get_alternate_names = (
        lambda x: '; '.join([part.strip() for part in x.split(';')[1:]]))

    labels = {
        s2p.MOLDATAKEY: ('molfile', True),
        'facility_reagent_id': (
            'facility_id', True, None,
            lambda x: util.convertdata(x[x.index('HMSL')+4:], int)),
        'salt_id': ('salt_id', True, None,
                    lambda x: util.convertdata(x, int)),
        'lincs_id': ('lincs_id', False),
        'chemical_name': ('name', True),
        'alternative_names': 'alternative_names',
        'pubchem_cid': 'pubchem_cid',
        'chembl_id': 'chembl_id',
        'chebi_id': 'chebi_id',
        'inchi': '_inchi',
        'inchi_key': '_inchi_key',
        'smiles': ('_smiles', False),
        'molecular_mass': (
            '_molecular_mass', False, None,
            lambda x: round(util.convertdata(x, float), 2)),
        'relevant_citations': '_relevant_citations',
        'molecular_formula': '_molecular_formula',
        'software': 'software',
        'date_data_received': ('date_data_received', False, None,
                               util.date_converter),
        'date_loaded': ('date_loaded', False, None, util.date_converter),
        'date_publicly_available': ('date_publicly_available', False, None,
                                    util.date_converter),
        'date_updated': ('date_updated', False, None,
                         util.date_converter),
        'is_restricted': ('is_restricted', False, False,
                          util.bool_converter)
    }
    labels = util.fill_in_column_definitions(properties, labels)

    assert typecheck.isstring(path)
    with open(path) as fh:
        data = fh.read().decode(DEFAULT_ENCODING)

    records = s2p.parse_sdf(data)
    logger.info('rows read: %d ', len(records))

    count = 0
    for record in records:
        initializer = {}
        # `column` (not `properties`) so the tuple defined above is not
        # shadowed by the per-field definition dict.
        for key, column in labels.items():
            required = column['required']
            default = column['default']
            converter = column['converter']
            model_field = column['model_field']

            value = record.get(key)

            try:
                if converter is not None:
                    value = converter(value)
                if value is None and default is not None:
                    value = default
                # 'n/a' is treated as "no value" (and is therefore
                # rejected for required fields below).
                if value == 'n/a':
                    value = None
                # `== True` is kept on purpose: the fill-in value that
                # util.fill_in_column_definitions uses for 'required' is
                # not visible here.
                if value is None and required == True:
                    raise Exception(
                        'Field is required: %r, values: %r, row: %d'
                        % (key, initializer, count))
                initializer[model_field] = value
            except Exception:
                logger.exception('invalid input, row: %d', count)
                # bare raise keeps the original traceback and matches the
                # save handler below
                raise

        # follows is a kludge, to split up the entered "chemical_name"
        # field, on ';' - TODO: just have two fields that get entered
        if initializer['name']:
            initializer['alternative_names'] = get_alternate_names(
                initializer['name'])
            initializer['name'] = get_primary_name(initializer['name'])

        try:
            sm = SmallMolecule(**initializer)
            sm.save()
            count += 1

            # create a default batch - 0
            SmallMoleculeBatch.objects.create(reagent=sm, batch_id=0)
        except Exception:
            logger.exception(
                'save failed for: %r, row: %d', initializer, count)
            raise