def ingest_tsv(filepath):
    """Parse a tab-separated file into a list of row dicts.

    The first row is treated as the header. Every header cell is passed
    through ``normalize_column_name`` to obtain the key used in the row
    dicts. Data rows whose cell count differs from the header's are dropped.
    For each column a datatype ('boolean', 'numeric' or 'string') is derived
    from the kept rows and handed, together with the raw and normalized
    column names, to ``save_file_metadata`` via ``headers=``.

    Args:
        filepath: path of the TSV file to read.

    Returns:
        A list with one dict per kept row, mapping normalized column name to
        the cell value as read by ``csv.reader``.

    Raises:
        ValueError: if the file is empty or its header row has no columns.
    """
    log.info('ingesting %s as tsv file', filepath)
    save_file_metadata(filepath, status='parsing', filetype='tsv')

    # NOTE(review): 'rU' is the Python 2 universal-newline mode; it is kept
    # because the rest of this block targets Python 2 -- revisit on a port.
    with open(filepath, 'rU') as fid:
        reader = csv.reader(fid, delimiter='\t')
        # A default avoids a bare StopIteration on an empty file, so the
        # explicit ValueError below is what the caller actually sees.
        header = next(reader, [])
        log.debug("%d columns: %s", len(header), ", ".join(header))
        if not header:
            raise ValueError('header row must contain at least one column')
        keys = [normalize_column_name(h) for h in header]
        # Rows that do not line up with the header are silently skipped.
        parsed = [dict(zip(keys, row)) for row in reader
                  if len(row) == len(keys)]

    header = [{'raw': h, 'key': k} for h, k in zip(header, keys)]
    for h in header:
        data = [p[h['key']] for p in parsed]
        # With no data rows both all() calls are vacuously true, so every
        # column is reported as 'boolean'.
        if all(is_boolean(d) for d in data):
            h['datatype'] = 'boolean'
        elif all(is_numeric(d) for d in data):
            h['datatype'] = 'numeric'
        else:
            h['datatype'] = 'string'
    save_file_metadata(filepath, headers=header)
    return parsed
def upload(self, file, parser):
    """Store an uploaded file in the dropbox directory and record its metadata.

    A metadata record with status "uploading" is written first; the id it
    returns is combined with the target path by ``add_id`` to form the final
    path the file is saved to. After saving, the status is updated to
    "uploaded" and ``self.event`` is set.

    Args:
        file: upload object exposing ``filename`` and ``save(path)``
            (presumably a web-framework file upload -- confirm with caller).
        parser: parser name stored in the file's metadata.

    Returns:
        The id returned by the final ``save_file_metadata`` call.

    Raises:
        ValueError: if ``file.filename`` has no usable final path component.
    """
    # The name comes from the uploader, so keep only its last path component;
    # otherwise something like "../../x" would be written outside the dropbox.
    filename = os.path.basename(file.filename or "")
    if filename in ("", ".", ".."):
        raise ValueError(
            "upload has no usable filename: {!r}".format(file.filename))

    filepath = os.path.join(app.dropbox_path, filename)
    _id = save_file_metadata(filepath, status="uploading", parser=parser)
    filepath = add_id(_id, filepath)
    file.save(filepath)
    log.info("uploaded %s as %s", file.filename, filepath)
    _id = save_file_metadata(filepath, status="uploaded")
    # Signal whoever waits on this event that a new file is available.
    self.event.set()
    return _id
def upload(self, file, parser):
    """Save an uploaded file into the dropbox directory and track its status.

    Writes a metadata record with status 'uploading', uses the id it returns
    (via ``add_id``) to build the path the file is saved under, updates the
    status to 'uploaded', and finally sets ``self.event``.

    Args:
        file: upload object providing ``filename`` and ``save(path)``
            (presumably a web-framework file upload -- confirm with caller).
        parser: parser name recorded in the file's metadata.

    Returns:
        The id returned by the last ``save_file_metadata`` call.

    Raises:
        ValueError: if ``file.filename`` yields no usable file name.
    """
    # Only the final path component of the uploader-supplied name is used, so
    # a name such as '../../x' cannot place the file outside the dropbox.
    safe_name = os.path.basename(file.filename or '')
    if safe_name in ('', '.', '..'):
        raise ValueError(
            'upload has no usable filename: {!r}'.format(file.filename))

    filepath = os.path.join(app.dropbox_path, safe_name)
    _id = save_file_metadata(filepath, status='uploading', parser=parser)
    filepath = add_id(_id, filepath)
    file.save(filepath)
    log.info('uploaded %s as %s', file.filename, filepath)
    _id = save_file_metadata(filepath, status='uploaded')
    # Wake up whoever is waiting on this event for new uploads.
    self.event.set()
    return _id
def ingest(filepath):
    """Parse a dropbox file and store its rows in MongoDB.

    The id obtained from ``split_id(filepath)`` is used to look up the file's
    record in the ``files.meta`` collection. The record's ``parser`` field
    selects the parser; the parsed rows are inserted into the ``files``
    database collection named after the id, and the metadata status is set
    to 'success' together with the row count.

    Args:
        filepath: path of the file to ingest.

    Returns:
        Whatever ``save_file_metadata`` returns for the 'success' update.

    Raises:
        LookupError: if no metadata record exists for the file (the status
            is set to 'error' before raising).
        NotImplementedError: if the recorded parser is not 'tsv'.
    """
    _id, _ = split_id(filepath)
    client = pymongo.MongoClient()
    try:
        meta = client.files.meta.find_one({'_id': _id})
        if not meta:
            save_file_metadata(filepath, status='error')
            raise LookupError('no metadata found for {}'.format(filepath))

        parser = meta['parser']
        if parser != 'tsv':
            # '{}' rather than '%s': str.format() ignores %-placeholders, so
            # the parser name was never included in the message before.
            raise NotImplementedError('unknown parser {}'.format(parser))

        data = ingest_tsv(filepath)
        client.files[str(_id)].insert(data)
        return save_file_metadata(filepath, status='success', count=len(data))
    finally:
        # Release the connection on every path, including the error ones.
        client.close()
def process(self, filepath):
    """Ingest a dropbox file once and move it out of the dropbox.

    ``self.files`` maps a path to a flag saying whether it has already been
    handled. A path not yet handled is passed to ``ingest.ingest``; on
    success the file is moved under ``app.ingested_path``, on any failure its
    metadata status is set to 'error' and it is moved under
    ``app.failed_path``. Either way the path is then flagged as processed.

    Hidden files (basename starting with '.') and paths with an empty
    basename are ignored entirely.

    Args:
        filepath: path of the file found in the dropbox.
    """
    name = os.path.basename(filepath)
    # ignore hidden files (e.g., .gitignore); an empty basename (path ending
    # in a separator) names no file and used to raise IndexError here
    if not name or name.startswith('.'):
        return

    try:
        processed = self.files[filepath]
    except KeyError:
        log.info(
            'new file in dropbox %s created %s', filepath,
            datetime.datetime.fromtimestamp(os.path.getctime(filepath)))
        processed = False

    if not processed:
        try:
            _id = ingest.ingest(filepath)
            # move file to ingested directory
            dest = add_id(_id, os.path.join(app.ingested_path, name))
            log.info('moving ingested file from %s to %s', filepath, dest)
            shutil.move(filepath, dest)
        except Exception as e:
            # Deliberately broad: any failure sends the file to the failed
            # directory instead of leaving it in the dropbox.
            log.warning('failed to ingest %s', filepath)
            log.warning(e)
            _id = save_file_metadata(filepath, status='error')
            # move file to failed directory
            dest = add_id(_id, os.path.join(app.failed_path, name))
            log.info('moving failed file from %s to %s', filepath, dest)
            shutil.move(filepath, dest)
        processed = True

    self.files[filepath] = processed
def process(self, filepath):
    """Ingest a file from the dropbox exactly once, then move it away.

    ``self.files`` records, per path, whether the file was already handled.
    An unhandled path is given to ``ingest.ingest``; if that succeeds the
    file is moved under ``app.ingested_path``, and on any exception its
    metadata status becomes "error" and it is moved under
    ``app.failed_path``. The path is flagged as processed in both cases.

    Paths whose basename is empty or starts with "." are skipped.

    Args:
        filepath: path of the file found in the dropbox.
    """
    basename = os.path.basename(filepath)
    # ignore hidden files (e.g., .gitignore). startswith() also copes with an
    # empty basename, which indexing [0] did not (IndexError).
    if not basename or basename.startswith("."):
        return

    try:
        processed = self.files[filepath]
    except KeyError:
        log.info(
            "new file in dropbox %s created %s",
            filepath,
            datetime.datetime.fromtimestamp(os.path.getctime(filepath)),
        )
        processed = False

    if not processed:
        try:
            _id = ingest.ingest(filepath)
            # move file to ingested directory
            dest = add_id(_id, os.path.join(app.ingested_path, basename))
            log.info("moving ingested file from %s to %s", filepath, dest)
            shutil.move(filepath, dest)
        except Exception as e:
            # Broad on purpose: whatever went wrong, the file is parked in
            # the failed directory rather than retried forever.
            log.warning("failed to ingest %s", filepath)
            log.warning(e)
            _id = save_file_metadata(filepath, status="error")
            # move file to failed directory
            dest = add_id(_id, os.path.join(app.failed_path, basename))
            log.info("moving failed file from %s to %s", filepath, dest)
            shutil.move(filepath, dest)
        processed = True

    self.files[filepath] = processed