def addNewRecords(self):
    """Import every record id from ``self.highnum`` down to ``self.lownum``.

    Ids are visited in descending order; ``self.lownum`` itself is excluded.
    For each id:

    * if ``Company.byRecordid`` finds it, the record is reported as already
      handled and skipped;
    * otherwise the file named by ``self.filename(recordid)`` is read and
      its text is passed to ``self.processRecord``.

    Empty files, and records for which ``processRecord`` raises
    ``AttributeError``, are logged and skipped.

    Raises:
        IOError: if a record file cannot be read and ``options.skipmissing``
            is false.

    XXX: should this be a classmethod?
    """
    # NOTE(review): xrange only exists on Python 2; switch to range() if this
    # module is ever run on Python 3.
    for recordid in xrange(self.highnum, self.lownum, -1):
        try:
            Company.byRecordid(recordid)
        except SQLObjectNotFound:
            # Not stored yet: fall through and import it from disk.
            pass
        else:
            print('already handled record %s' % recordid)
            continue

        filename = self.filename(recordid)
        try:
            # A windows-1252 decoding read is left disabled here; the plain
            # read below is what is used:
            # pagetext = codecs.open(filename, 'r', 'windows-1252').read()
            # The context manager closes the handle even when read() fails,
            # so a long run does not accumulate open files.
            with open(filename, 'r') as pagefile:
                pagetext = pagefile.read()
        except IOError:
            if options.skipmissing:
                print('skipping missing record %s' % recordid)
                continue
            print('halting on missing record %s' % recordid)
            raise

        if not pagetext:
            log('%s is empty' % recordid)
            continue

        try:
            self.processRecord(pagetext)
        except AttributeError:
            # Report the failure in full, then carry on with the next record.
            traceback.print_exc()
            log('failed to process record %s' % recordid)
            continue

        print('added record %s' % recordid)
def processRecord(self, rawtext):
    """Store one scraped page as a ``Company`` together with its people.

    ``rawtext`` is run through ``self.scrapeData``; the resulting mapping
    supplies the company name, record id, registration date and the people
    (subscribers, agent, directors and titled officials) attached to it.
    The session is committed before returning.

    A ``registerdate`` that does not match ``%d-%m-%Y`` is logged and the
    founding date is left unset.

    Returns:
        The newly created ``Company`` record.

    NOTE(review): a second ``processRecord`` definition follows this one in
    the file; if both live in the same scope, the later one replaces this.
    """
    session = Session()
    fields = self.scrapeData(rawtext)

    def people():
        # Lazily yield (role, name) pairs so each lookup happens right
        # before the matching addPerson call, in storage order.
        for subscriber in fields['suscriptores']:
            yield 'subscriber', subscriber
        if fields['agent']:
            yield 'agent', fields['agent']
        for director in fields['directors']:
            yield 'director', director
        for rawrole, holder in fields['titles'].items():
            rolekey = rawrole.lower()
            # Map known official roles to their title; keep others as-is.
            yield self.officials.get(rolekey, rolekey), holder

    company = Company(
        name=fields['nombredelasociedad'],
        recordid=int(fields['nodeficha']),
        scrape_date=None,
        scrape_source=None,
        is_current=None,
        data=None,
    )

    try:
        # XXX does this really need to be a register thing
        parsed = time.strptime(fields['registerdate'], '%d-%m-%Y')
        company.date_founded = time.strftime('%Y-%m-%d', parsed)
    except ValueError:
        log('invalid date: %s' % fields['registerdate'])

    for role, name in people():
        company.addPerson(role=role, name=name, session=session)

    session.commit()
    return company
def processRecord(self, rawtext):
    """Create a ``Company`` and its associated people from one raw page.

    The page text is parsed by ``self.scrapeData``. The company is created
    from the scraped name and record id, its founding date is set from the
    scraped ``registerdate`` (converted from ``%d-%m-%Y`` to ``%Y-%m-%d``),
    and every subscriber, agent, director and titled official is attached
    through ``addPerson``. The session is committed at the end.

    An unparseable ``registerdate`` is logged and otherwise ignored.

    Returns:
        The newly created ``Company`` record.

    NOTE(review): a ``processRecord`` definition also precedes this one in
    the file; if both live in the same scope, this later one is the one
    that stays bound.
    """
    db = Session()
    scraped = self.scrapeData(rawtext)
    record = Company(
        name=scraped['nombredelasociedad'],
        recordid=int(scraped['nodeficha']),
        scrape_date=None,
        scrape_source=None,
        is_current=None,
        data=None,
    )

    def attach(role, name):
        # Every person is attached to the same record within one session.
        record.addPerson(role=role, name=name, session=db)

    try:
        # XXX does this really need to be a register thing
        founded = time.strptime(scraped['registerdate'], '%d-%m-%Y')
        isodate = time.strftime('%Y-%m-%d', founded)
        record.date_founded = isodate
    except ValueError:
        log('invalid date: %s' % scraped['registerdate'])

    for subscriber in scraped['suscriptores']:
        attach('subscriber', subscriber)

    if scraped['agent']:
        attach('agent', scraped['agent'])

    for director in scraped['directors']:
        attach('director', director)

    for rawrole, holder in scraped['titles'].items():
        rolekey = rawrole.lower()
        # Known official roles map to a title; unknown ones pass through.
        attach(self.officials.get(rolekey, rolekey), holder)

    db.commit()
    return record