Ejemplo n.º 1
0
def ingest_tsv(filepath):
    """Parse a tab-separated file into a list of row dictionaries.

    The first row is treated as the header.  Every header cell is passed
    through ``normalize_column_name`` to obtain the dictionary keys.  Data
    rows whose cell count differs from the header's are dropped.  Each
    column is then labeled 'boolean', 'numeric' or 'string' according to
    ``is_boolean`` / ``is_numeric`` applied to all of its values, and the
    resulting header descriptions are stored via ``save_file_metadata``.

    Args:
        filepath: path of the TSV file to read.

    Returns:
        A list with one dict per well-formed data row, mapping the
        normalized column key to the raw cell string.

    Raises:
        ValueError: if the file is empty or its header row has no columns.
    """
    log.info('ingesting %s as tsv file', filepath)
    save_file_metadata(filepath, status='parsing', filetype='tsv')

    # 'rU' (universal newlines) is kept as-is because the surrounding code
    # targets Python 2; note that the 'U' flag no longer exists in
    # Python 3.11+.
    with open(filepath, 'rU') as fid:
        reader = csv.reader(fid, delimiter='\t')

        # next() with a default turns an empty file into an empty header so
        # it is reported as ValueError below instead of leaking a bare
        # StopIteration to the caller.
        header = next(reader, [])
        log.debug("%d columns: %s", len(header), ", ".join(header))
        if not header:
            raise ValueError('header row must contain at least one column')

        keys = [normalize_column_name(h) for h in header]

        parsed = []
        skipped = 0
        for row in reader:
            if len(row) == len(keys):
                parsed.append(dict(zip(keys, row)))
            else:
                skipped += 1

    if skipped:
        # Malformed rows are still dropped, but no longer without a trace.
        log.debug('skipped %d rows whose length differs from the header',
                  skipped)

    columns = [{'raw': raw, 'key': key} for raw, key in zip(header, keys)]

    for column in columns:
        values = [record[column['key']] for record in parsed]
        # NOTE: with no data rows all() is vacuously true, so every column
        # is labeled 'boolean' in that case.
        if all(is_boolean(value) for value in values):
            column['datatype'] = 'boolean'
        elif all(is_numeric(value) for value in values):
            column['datatype'] = 'numeric'
        else:
            column['datatype'] = 'string'

    save_file_metadata(filepath, headers=columns)

    return parsed
Ejemplo n.º 2
0
 def upload(self, file, parser):
     """Store an uploaded file in the dropbox directory and record it.

     Metadata is saved with status "uploading" before the file is written
     and with status "uploaded" afterwards.  The id returned by the first
     ``save_file_metadata`` call is combined with the path via ``add_id``
     to obtain the location the file is actually saved to.

     Args:
         file: uploaded file object exposing ``filename`` and
             ``save(path)``.
         parser: parser name stored in the file's metadata.

     Returns:
         The value returned by the final ``save_file_metadata`` call.
     """
     # The file name comes from the client: keep only its last path
     # component so a name such as "../../x" cannot point outside the
     # dropbox directory.
     filename = os.path.basename(file.filename)
     filepath = os.path.join(app.dropbox_path, filename)
     _id = save_file_metadata(filepath, status="uploading", parser=parser)
     filepath = add_id(_id, filepath)
     file.save(filepath)
     # Lazy %-style arguments, consistent with the other log calls.
     log.info("uploaded %s as %s", file.filename, filepath)
     _id = save_file_metadata(filepath, status="uploaded")
     # NOTE(review): presumably wakes whatever waits on this event to pick
     # up the new file -- confirm against the consumer of self.event.
     self.event.set()
     return _id
Ejemplo n.º 3
0
 def upload(self, file, parser):
     """Store an uploaded file in the dropbox directory and record it.

     Metadata is saved with status 'uploading' before the file is written
     and with status 'uploaded' afterwards.  The id returned by the first
     ``save_file_metadata`` call is combined with the path via ``add_id``
     to obtain the location the file is actually saved to.

     Args:
         file: uploaded file object exposing ``filename`` and
             ``save(path)``.
         parser: parser name stored in the file's metadata.

     Returns:
         The value returned by the final ``save_file_metadata`` call.
     """
     # The file name comes from the client: keep only its last path
     # component so a name such as '../../x' cannot point outside the
     # dropbox directory.
     filename = os.path.basename(file.filename)
     filepath = os.path.join(app.dropbox_path, filename)
     _id = save_file_metadata(filepath, status='uploading', parser=parser)
     filepath = add_id(_id, filepath)
     file.save(filepath)
     # Lazy %-style arguments, consistent with the other log calls.
     log.info('uploaded %s as %s', file.filename, filepath)
     _id = save_file_metadata(filepath, status='uploaded')
     # NOTE(review): presumably wakes whatever waits on this event to pick
     # up the new file -- confirm against the consumer of self.event.
     self.event.set()
     return _id
Ejemplo n.º 4
0
def ingest(filepath):
    """Ingest a file with the parser named in its stored metadata.

    The id obtained from ``split_id(filepath)`` is used to look up the
    file's metadata document in the ``files.meta`` collection.  The parsed
    rows are inserted into the ``files`` database collection named after
    that id, and the metadata is updated with status 'success' and the row
    count.

    Args:
        filepath: path of the file to ingest.

    Returns:
        The value returned by ``save_file_metadata`` for the success update.

    Raises:
        LookupError: if no metadata document exists for the file (the
            metadata status is set to 'error' first).
        NotImplementedError: if the recorded parser is not 'tsv'.
    """
    _id, _ = split_id(filepath)

    client = pymongo.MongoClient()
    try:
        meta = client.files.meta.find_one({'_id': _id})

        if not meta:
            save_file_metadata(filepath, status='error')
            raise LookupError('no metadata found for {}'.format(filepath))

        parser = meta['parser']

        if parser == 'tsv':
            data = ingest_tsv(filepath)
        else:
            # The placeholder must be '{}' for str.format; the previous
            # '%s' was never substituted, hiding the offending parser name.
            raise NotImplementedError('unknown parser {}'.format(parser))

        client.files[str(_id)].insert(data)
    finally:
        # Release the connection on every path, including the error ones.
        client.close()

    return save_file_metadata(filepath, status='success', count=len(data))
Ejemplo n.º 5
0
    def process(self, filepath):
        """Ingest a dropbox file once and move it out of the dropbox.

        Hidden files and paths without a file name are ignored, as are
        paths already flagged as processed in ``self.files``.  Otherwise
        the file is handed to ``ingest.ingest``; on success it is moved
        under ``app.ingested_path``, on any failure its metadata status is
        set to 'error' and it is moved under ``app.failed_path``.  In both
        cases the path is then flagged as processed in ``self.files``.

        Args:
            filepath: path of the file to process.
        """
        name = os.path.basename(filepath)

        # ignore hidden files (e.g., .gitignore); startswith() also copes
        # with an empty name (path ending in a separator), which indexing
        # with [0] turned into an IndexError.
        if not name or name.startswith('.'):
            return

        try:
            processed = self.files[filepath]
        except KeyError:
            log.info(
                'new file in dropbox %s created %s', filepath,
                datetime.datetime.fromtimestamp(
                    os.path.getctime(filepath)))
            processed = False

        if processed:
            return

        try:
            _id = ingest.ingest(filepath)

            # move file to ingested directory
            dest = add_id(_id, os.path.join(app.ingested_path, name))
            log.info('moving ingested file from %s to %s', filepath,
                     dest)
            shutil.move(filepath, dest)

        except Exception as e:
            # Deliberately broad: whatever went wrong above, the file is
            # flagged as failed and taken out of the dropbox.
            log.warn('failed to ingest %s', filepath)
            log.warn(e)

            _id = save_file_metadata(filepath, status='error')

            # move file to failed directory
            dest = add_id(_id, os.path.join(app.failed_path, name))
            log.info('moving failed file from %s to %s', filepath,
                     dest)
            shutil.move(filepath, dest)

        self.files[filepath] = True
Ejemplo n.º 6
0
    def process(self, filepath):
        """Ingest a dropbox file once and move it out of the dropbox.

        Hidden files and paths without a file name are ignored, as are
        paths already flagged as processed in ``self.files``.  Otherwise
        the file is handed to ``ingest.ingest``; on success it is moved
        under ``app.ingested_path``, on any failure its metadata status is
        set to "error" and it is moved under ``app.failed_path``.  In both
        cases the path is then flagged as processed in ``self.files``.

        Args:
            filepath: path of the file to process.
        """
        name = os.path.basename(filepath)

        # ignore hidden files (e.g., .gitignore); startswith() also copes
        # with an empty name (path ending in a separator), which indexing
        # with [0] turned into an IndexError.
        if not name or name.startswith("."):
            return

        try:
            processed = self.files[filepath]
        except KeyError:
            log.info(
                "new file in dropbox %s created %s",
                filepath,
                datetime.datetime.fromtimestamp(os.path.getctime(filepath)),
            )
            processed = False

        if processed:
            return

        try:
            _id = ingest.ingest(filepath)

            # move file to ingested directory
            dest = add_id(_id, os.path.join(app.ingested_path, name))
            log.info("moving ingested file from %s to %s", filepath, dest)
            shutil.move(filepath, dest)

        except Exception as e:
            # Deliberately broad: whatever went wrong above, the file is
            # flagged as failed and taken out of the dropbox.
            log.warn("failed to ingest %s", filepath)
            log.warn(e)

            _id = save_file_metadata(filepath, status="error")

            # move file to failed directory
            dest = add_id(_id, os.path.join(app.failed_path, name))
            log.info("moving failed file from %s to %s", filepath, dest)
            shutil.move(filepath, dest)

        self.files[filepath] = True