def _setContainerID(self, d, containerid=None):
    """Assign an identifier and a container reference to ``d`` in place.

    Args:
        d (dict): the item to update. Its ``_id`` and ``containerid`` keys
            are overwritten.
        containerid: identifier of the container this item belongs to. Any
            falsy value (``None``, ``0``, ``''``...) is stored as ``0``.

    Returns:
        str: the identifier stored in ``d['_id']``.
    """
    # The identifier is generated before 'containerid' is written, exactly
    # as both branches of the previous implementation did.
    item_id = str(generate_id(d))
    d['_id'] = item_id
    d['containerid'] = containerid or 0
    return item_id
def upload_avatar(self, img):
    """Store a new avatar image and register it through ``update_avatar``.

    The image is written to ``<UPLOAD_FOLDER>/avatars/<avatar_id>.png``,
    where ``avatar_id`` comes from ``generate_id()``.

    Args:
        img: either a string starting with ``http`` (the image is downloaded
            from that URL) or an object exposing ``save(filename)``.

    Returns:
        None. When a download does not answer with HTTP 200 nothing is
        written and ``update_avatar`` is not called, so the previously
        registered avatar is left untouched.

    Raises:
        requests.exceptions.RequestException: if the download cannot be
            performed (connection error, timeout...).
    """
    avatar_id = generate_id()
    filename = os.path.join(UPLOAD_FOLDER, 'avatars', '{}.png'.format(avatar_id))

    if isinstance(img, str) and img.startswith('http'):
        # With stream=True the connection is only released once the body is
        # consumed or the response is closed, so the response is used as a
        # context manager. The timeout prevents an unresponsive host from
        # blocking the caller forever.
        with requests.get(img, stream=True, timeout=10) as r:
            if r.status_code != 200:
                # No file was written: do not register a dangling avatar id.
                return
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
    else:
        img.save(filename)

    self.update_avatar(avatar_id)
def run(self, path):
    """Adapt the items produced by ``from_module`` into indexing actions.

    Args:
        path: forwarded to ``check_params`` and to ``from_module.run``.

    Yields:
        dict: one action per item, with the keys ``_index``, ``_type``,
        ``_id``, ``_source`` and ``_op_type``.
    """
    self.logger().info('Indexing: %s', path)
    self.check_params(path, check_from_module=True)
    default_index = self.myconfig('name').lower()
    doc_type = self.myconfig('doc_type')
    # custom tags configured for this section, if any
    tags = base.config.parse_conf_array(self.myconfig('tags'))

    try:
        for source in self.from_module.run(path):
            if tags:
                source['tags'] = tags
            doc_id = str(generate_id(source))
            # an item may carry its own index name; otherwise use the default one
            index_name = source.pop('_index', default_index)
            yield {
                '_index': index_name,
                '_type': doc_type,
                '_id': doc_id,
                '_source': source,
                '_op_type': self.myconfig('operation'),
            }
    except base.job.RVTError as exc:
        # log the traceback as a warning, the message as an error, and stop
        import traceback
        trace = traceback.format_exc()
        self.logger().warning(trace)
        self.logger().error(str(exc))
    except KeyboardInterrupt:
        # the module was interrupted: stop producing actions quietly
        pass
def _post_parse_file_single(self, filepath, filemetadata, status=200):
    """Build the item for a single file from the metadata returned by Tika.

    Known metadata fields are mapped through ``_map_field``; unknown fields
    are kept only when the ``include_unknown`` flag is set. Content longer
    than the ``content_max_size`` option is dropped and the status set to 413.

    Args:
        filepath (str): the path to the file
        filemetadata (dict): a dictionary of metadata, as returned by tika.
            May be None. Its ``Content-Type`` key is overwritten in place.
        status (int): the status returned by tika server (200=OK)

    Returns:
        A list with a single item, which contains the file metadata.
    """
    item = self._common_fields(filepath)
    item.update({'tika_status': status, 'content': '', 'containerid': '0'})

    # filemetadata may be None for empty files or after an error in Tika
    if filemetadata is not None:
        content_type = self._guess_content_type(filemetadata)
        filemetadata['Content-Type'] = content_type
        for name, value in filemetadata.items():
            target = self._map_field(content_type, name)
            if target is None:
                # unknown field: keep it under its original name if requested
                if self.myflag('include_unknown'):
                    item[name] = value
            elif target != IGNORE_FIELD:
                item[target] = value

    # content that is too large is removed and flagged with status 413
    content_size = len(item.get('content', ''))
    if content_size > int(self.myconfig('content_max_size')):
        item['content'] = ''
        item['tika_status'] = 413

    # metadata cannot be a list: flatten lists into comma separated strings
    for key, value in item.items():
        if isinstance(value, list):
            item[key] = ','.join(value)

    item['_id'] = str(generate_id(item))
    return [item]
def run(self, path):
    """Adapt the items produced by ``from_module`` into indexing actions.

    When the ``rvtindex`` option is set, the execution itself is also
    registered in that index: a first ``update`` action with status
    ``started`` is yielded before any item, and a last one with the final
    status (``ended``, ``error`` or ``interrupted``) after them.

    Args:
        path: forwarded to ``check_params`` and to ``from_module.run``.

    Yields:
        dict: actions with the keys ``_index``, ``_type``, ``_id``,
        ``_source`` and ``_op_type``.
    """
    self.logger().info('Indexing: %s', path)
    self.check_params(path, check_from_module=True)
    name = self.myconfig('name').lower()
    doc_type = self.myconfig('doc_type')

    # save metadata for this execution
    metadata = None
    if self.myconfig('rvtindex'):
        metadata = {
            'casename': self.myconfig('casename'),
            'source': self.myconfig('source'),
            'started': datetime.datetime.utcnow().isoformat(),
            'path': path,
            'server': self.myconfig('server'),
            'name': name,
            'status': 'started',
        }
        yield {
            '_index': self.myconfig('rvtindex'),
            '_type': doc_type,
            '_id': name,
            '_source': metadata,
            '_op_type': 'update',
        }

    outcome = ''
    # custom tags configured for this section, if any
    tags = base.config.parse_conf_array(self.myconfig('tags'))
    try:
        for source in self.from_module.run(path):
            if tags:
                source['tags'] = tags
            doc_id = str(generate_id(source))
            # an item may carry its own index name; otherwise use the default one
            index_name = source.pop('_index', name)
            yield {
                '_index': index_name,
                '_type': doc_type,
                '_id': doc_id,
                '_source': source,
                '_op_type': self.myconfig('operation'),
            }
        outcome = 'ended'
    except base.job.RVTError as exc:
        # log the traceback as a warning, the message as an error, and stop
        import traceback
        trace = traceback.format_exc()
        self.logger().warning(trace)
        self.logger().error(str(exc))
        outcome = 'error'
    except KeyboardInterrupt:
        # the module was interrupted
        outcome = 'interrupted'

    if metadata:
        # register the result of the execution
        result = {
            'status': outcome,
            'ended': datetime.datetime.utcnow().isoformat(),
        }
        yield {
            '_index': self.myconfig('rvtindex'),
            '_type': doc_type,
            '_id': name,
            '_source': result,
            '_op_type': 'update',
        }