Esempio n. 1
0
 def _setContainerID(self, d, containerid=None):
     """Assign an identifier and a container reference to ``d`` in place.

     Args:
         d (dict): record to tag; the '_id' and 'containerid' keys are set.
         containerid: identifier of the enclosing container. Any falsy
             value (including the default ``None``) is stored as ``0``.

     Returns:
         The string stored under ``d['_id']``.
     """
     # The identifier is generated before 'containerid' is written.
     d['_id'] = str(generate_id(d))
     d['containerid'] = containerid if containerid else 0
     return d['_id']
Esempio n. 2
0
    def upload_avatar(self, img):
        """Store a new avatar image and register it through ``update_avatar``.

        The image is written to ``UPLOAD_FOLDER/avatars/<avatar_id>.png``.

        Args:
            img: either a string starting with 'http', which is downloaded,
                or an object exposing a ``save(filename)`` method.

        Returns:
            None. When a download does not answer with HTTP 200, no file is
            written and ``update_avatar`` is not called.
        """
        avatar_id = generate_id()
        filename = os.path.join(UPLOAD_FOLDER, 'avatars',
                                '{}.png'.format(avatar_id))

        if isinstance(img, str) and img.startswith('http'):
            r = requests.get(img, stream=True)
            try:
                if r.status_code != 200:
                    # Nothing was saved: do not point the avatar at a file
                    # that does not exist.
                    return
                with open(filename, 'wb') as f:
                    for chunk in r.iter_content(1024):
                        f.write(chunk)
            finally:
                # A streamed response holds its connection until the body is
                # fully read or the response is closed.
                r.close()
        else:
            img.save(filename)
        self.update_avatar(avatar_id)
Esempio n. 3
0
    def run(self, path):
        """
        Adapt every item produced by ``from_module`` into an indexing action.

        Each yielded dictionary carries the keys ``_index``, ``_type``,
        ``_id``, ``_source`` and ``_op_type``. A ``base.job.RVTError`` is
        logged and stops the iteration; a ``KeyboardInterrupt`` stops it
        without logging.

        Returns:
            An iterator with the adapted JSON.
        """
        self.logger().info('Indexing: %s', path)
        self.check_params(path, check_from_module=True)

        default_index = self.myconfig('name').lower()
        doc_type = self.myconfig('doc_type')

        # NOTE(review): exit_status is assigned below but never read in this method
        exit_status = ''
        # tags configured in the section, applied to every item
        section_tags = base.config.parse_conf_array(self.myconfig('tags'))
        try:
            for record in self.from_module.run(path):
                if section_tags:
                    # configured tags overwrite whatever the item carried
                    record['tags'] = section_tags
                # the identifier is computed before '_index' is removed
                record_id = str(generate_id(record))
                # an index name provided by the item wins over the default
                target_index = record.pop('_index', default_index)
                yield {
                    '_index': target_index,
                    '_type': doc_type,
                    '_id': record_id,
                    '_source': record,
                    '_op_type': self.myconfig('operation'),
                }
            exit_status = 'ended'
        except base.job.RVTError as exc:
            # log the traceback as a warning, the message as an error, and stop
            import traceback
            trace_text = traceback.format_exc()
            self.logger().warning(trace_text)
            self.logger().error(str(exc))
            exit_status = 'error'
        except KeyboardInterrupt:
            # the module was interrupted
            exit_status = 'interrupted'
Esempio n. 4
0
    def _post_parse_file_single(self, filepath, filemetadata, status=200):
        """ Gets the output from TIKA, maps fields and removes the content if it is too large.

        Args:
            filepath (str): the path to the file
            filemetadata (dict): a dictionary of metadata, as returned by tika.
                It may be None. Its 'Content-Type' entry is overwritten in place.
            status (int): the status returned by tika server (200=OK)

        Returns:
            A list with a single item, which contains the file metadata. """
        item = self._common_fields(filepath)
        item.update(dict(tika_status=status, content='', containerid='0'))
        # filemetadata may be None for empty files or after an error in Tika
        if filemetadata is not None:
            # guess content-type
            content_type = self._guess_content_type(filemetadata)
            filemetadata['Content-Type'] = content_type

            # map known fields
            for metadata in filemetadata:
                field = self._map_field(content_type, metadata)
                if field is None:
                    if self.myflag('include_unknown'):
                        item[metadata] = filemetadata[metadata]
                elif field != IGNORE_FIELD:
                    item[field] = filemetadata[metadata]
        # metadata cannot be list: flatten multi-valued fields BEFORE the size
        # check, so the limit is measured on the final text and not on the
        # number of list elements
        for key in item:
            value = item[key]
            if isinstance(value, list):
                # str() keeps the join from failing on non-string elements
                item[key] = ','.join(str(element) for element in value)
        # if the content is too large, remove it and set status to 413
        if len(item.get('content', '')) > int(
                self.myconfig('content_max_size')):
            item['content'] = ''
            item['tika_status'] = 413
        # identifier
        item['_id'] = str(generate_id(item))
        return [item]
Esempio n. 5
0
    def run(self, path):
        """
        Adapt every item produced by ``from_module`` into an indexing action.

        When the 'rvtindex' option is set, an 'update' action describing the
        execution is yielded before the items (status 'started') and another
        one after them (final status and end time). A ``base.job.RVTError``
        is logged and stops the items; a ``KeyboardInterrupt`` stops them
        without logging.

        Returns:
            An iterator with the adapted JSON.
        """
        self.logger().info('Indexing: %s', path)
        self.check_params(path, check_from_module=True)

        default_index = self.myconfig('name').lower()
        doc_type = self.myconfig('doc_type')

        # describe this execution, only if a registry index is configured
        execution_info = None
        if self.myconfig('rvtindex'):
            execution_info = {
                'casename': self.myconfig('casename'),
                'source': self.myconfig('source'),
                'started': datetime.datetime.utcnow().isoformat(),
                'path': path,
                'server': self.myconfig('server'),
                'name': default_index,
                'status': 'started',
            }
            yield {
                '_index': self.myconfig('rvtindex'),
                '_type': doc_type,
                '_id': default_index,
                '_source': execution_info,
                '_op_type': 'update',
            }

        exit_status = ''
        # tags configured in the section, applied to every item
        section_tags = base.config.parse_conf_array(self.myconfig('tags'))
        try:
            for record in self.from_module.run(path):
                if section_tags:
                    # configured tags overwrite whatever the item carried
                    record['tags'] = section_tags
                # the identifier is computed before '_index' is removed
                record_id = str(generate_id(record))
                # an index name provided by the item wins over the default
                target_index = record.pop('_index', default_index)
                yield {
                    '_index': target_index,
                    '_type': doc_type,
                    '_id': record_id,
                    '_source': record,
                    '_op_type': self.myconfig('operation'),
                }
            exit_status = 'ended'
        except base.job.RVTError as exc:
            # log the traceback as a warning, the message as an error, and stop
            import traceback
            trace_text = traceback.format_exc()
            self.logger().warning(trace_text)
            self.logger().error(str(exc))
            exit_status = 'error'
        except KeyboardInterrupt:
            # the module was interrupted
            exit_status = 'interrupted'

        if execution_info:
            # register how the execution finished
            outcome = {
                'status': exit_status,
                'ended': datetime.datetime.utcnow().isoformat(),
            }
            yield {
                '_index': self.myconfig('rvtindex'),
                '_type': doc_type,
                '_id': default_index,
                '_source': outcome,
                '_op_type': 'update',
            }