def store_record(self, index_name, doc_name, content):
    if not self.is_connected():
        logger.error('Error. Not connected to Elasticsearch')
        return
    if not isinstance(index_name, str):
        logger.error('Error. Index name must be a str')
        return
    if not isinstance(doc_name, str):
        logger.error('Error. Document name must be a str')
        return
    if not isinstance(content, dict):
        logger.error('Error. Content must be a dict')
        return
    t1 = time()
    logger.debug("Gevent (before es_obj.index): '{}'".format(
        gevent.getcurrent().name))
    try:
        res = self.es.index(index=index_name, doc_type=doc_name, body=content)
    except Exception as ex:
        logger.error('Error storing the data: {}'.format(str(ex)))
        return
    else:
        logger.debug("Gevent (after es_obj.index): '{}' - {}".format(
            gevent.getcurrent().name, time() - t1))
        return res
def connect(self):
    self.es = Elasticsearch([{'host': self.host, 'port': self.port}])
    if self.is_connected():
        msg = 'Connected to Elasticsearch on'
        logger.info('{msg} {host}:{port}'.format(
            msg=msg, host=self.host, port=self.port))
    else:
        msg = 'Error. Failed to connect to Elasticsearch on'
        logger.error('{msg} {host}:{port}'.format(
            msg=msg, host=self.host, port=self.port))
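# Hedged sketch: is_connected() is called throughout this class but is not
# shown in this section. A minimal version, assuming the official
# elasticsearch-py client, can delegate to its ping() health check:
def is_connected(self):
    # ping() returns True when the cluster answers and False otherwise;
    # it does not raise on connection errors.
    return self.es is not None and self.es.ping()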
def create_index(self, index_name, mapping=None):
    if not self.is_connected():
        logger.error('Error. Not connected to Elasticsearch')
        return
    if not isinstance(index_name, str):
        logger.error('Error. Index name must be a str')
        return
    if mapping and not isinstance(mapping, dict):
        logger.error('Error. Mapping must be a dictionary')
        return
    try:
        if not self.es.indices.exists(index_name):
            # ignore=400 suppresses the "index already exists" error.
            self.es.indices.create(index=index_name, body=mapping,
                                   ignore=[400, 404])
    except Exception as ex:
        logger.error("Error creating the index '{}'. Error: {}".format(
            index_name, str(ex)))
        return
    else:
        logger.info("Index '{}' is ready".format(index_name))
        return True
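# Illustrative usage sketch. The mapping below is a hypothetical example of
# the dict create_index() expects; the field names are assumptions, but the
# structure follows the standard Elasticsearch mappings API:
#
# doc_mapping = {
#     'mappings': {
#         'properties': {
#             'content': {'type': 'text'},
#             'created': {'type': 'date', 'format': 'yyyy-MM-dd HH:mm:ss'},
#             'tags': {'type': 'keyword'},
#         }
#     }
# }
# es.create_index('documents', doc_mapping)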
def search(self, index_name, content):
    if not self.is_connected():
        logger.error('Error. Not connected to Elasticsearch')
        return
    if not isinstance(index_name, str):
        logger.error('Error. Index name must be a str')
        return
    if not isinstance(content, dict):
        logger.error('Error. Content must be a dictionary')
        return
    return self.es.search(index=index_name, body=content)
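# Illustrative usage sketch (the index and field names are assumptions): a
# standard query DSL body is passed straight through to Elasticsearch's
# search API, which returns the usual hits structure:
#
# query = {'query': {'match': {'content': 'quarterly report'}}}
# res = es.search('documents', query)
# for hit in res['hits']['hits']:
#     print(hit['_source']['meta']['filename'])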
def es_init(es_addr, es_port):
    es = ES(es_addr, es_port)
    es.connect()
    for idx in mappings:
        if not es.secure_delete_index(idx):
            logger.error("Error deleting index '{}'".format(idx))
            return
        if not es.create_index(idx, mappings.get(idx)):
            logger.error("Error creating index '{}'".format(idx))
            return
    return
def secure_delete_index(self, index_name):
    if not self.is_connected():
        logger.error('Error. Not connected to Elasticsearch')
        return
    if not isinstance(index_name, str):
        logger.error('Error. Index name must be a str')
        return
    if not self.es.indices.exists(index_name):
        # Nothing to delete; report success so callers do not abort.
        return True
    msg = "Do you want to delete the index '{}'?".format(index_name)
    if utils.query_yes_no(msg, False):
        self.es.indices.delete(index=index_name)
        logger.info("The index {} was deleted successfully".format(
            index_name))
        return True
    return False
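# Hedged sketch: utils.query_yes_no() is used above but not defined in this
# section. A minimal interactive confirmation prompt with a default answer
# might look like this (the name and behavior are assumptions):
def query_yes_no(question, default=False):
    suffix = ' [Y/n] ' if default else ' [y/N] '
    answer = input(question + suffix).strip().lower()
    if not answer:
        return default
    return answer in ('y', 'yes')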
def parse_pdf2img(filename, folder_img):
    try:
        with tempfile.TemporaryDirectory() as tmppath:
            # Render the first 10 pages as JPEGs into a temp dir first, so a
            # failed conversion leaves no partial output behind.
            convert_from_path(filename, dpi=80, fmt='jpeg', strict=False,
                              last_page=10, output_folder=tmppath)
            utils.create_directory(folder_img)
            for file in os.listdir(tmppath):
                src = os.path.join(tmppath, file)
                shutil.move(src, folder_img)
            return True
    except Exception:
        logger.error("pdf2image could not convert the document '{}'".format(
            filename))
        return False
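# Hedged sketch: utils.create_directory() is not shown in this section; it is
# assumed to be a thin idempotent wrapper around os.makedirs:
def create_directory(path):
    # exist_ok=True tolerates a directory that is already present.
    os.makedirs(path, exist_ok=True)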
def parse_pdf(root, file_name, file_extension, folder='', encoding='utf-8'):
    t0 = time()
    content = {}
    file_path = os.path.join(root, folder, file_name + file_extension)
    logger.debug('Gevent (init parse_pdf): {}. File: {}'.format(
        gevent.getcurrent().name, file_path))
    status = 'error'
    clean_text = ''
    exclude_sent_with_words = []
    file_exc_words = './exclude_words.txt'
    if os.path.isfile(file_exc_words):
        exclude_sent_with_words = utils.read_txt_file(file_exc_words)
    if file_extension != '.pdf':
        logger.error("File extension of '{}' is not '.pdf'".format(file_path))
    else:
        eof = subprocess.check_output(['tail', '-n', '1', file_path])
        # Normalize the trailer: %%EOF, %%EOF\n, %%EOF\r, %%EOF\r\n
        eof = eof.replace(b'\r', b'')
        eof = eof.replace(b'\n', b'')
        # A well-formed PDF ends with the 5-byte %%EOF marker.
        if b'%%EOF' not in eof[-5:]:
            logger.error("Error reading EOF bytes '{}' from '{}'".format(
                eof.decode('utf-8', errors='replace'), file_path))
        else:
            t1 = time()
            pdfinfo = get_pdfinfo(file_path)
            numpages = pdfinfo.get('pages', -1)
            logger.debug('Gevent (before textract.process): {}'.format(
                gevent.getcurrent().name))
            try:
                text = textract.process(file_path, encoding=encoding)
            except Exception:
                logger.error(
                    ("Unexpected error while parsing PDF '{}' " +
                     "using textract").format(file_path))
                return {'status': status, 'args': file_path, 'data': content}
            logger.debug('Gevent (after textract.process): {} - {}'.format(
                gevent.getcurrent().name, time() - t1))
            text = text.decode(encoding)
            text = utils.remove_non_printable_chars(text)
            text = text.split('\n')
            for line in text:
                if not line:
                    # Collapse runs of blank lines into a single newline.
                    if clean_text[-2:] != '\n\n':
                        clean_text += '\n'
                    continue
                if 'disclosure' in line.lower():
                    # Everything after the disclosure section is boilerplate.
                    break
                if '@' in line.lower():
                    # Skip lines containing e-mail addresses.
                    continue
                if any(re.search(r'\b' + re.escape(word.lower()) + r'\b',
                                 line.lower())
                       for word in exclude_sent_with_words):
                    continue
                # Drop lines repeated across most pages (headers/footers).
                if text.count(line) <= max(numpages - 10, 4):
                    # Remove extra spaces.
                    clean_line = re.sub(r'\s+', ' ', line)
                    clean_line = utils.remove_nonsense_lines(
                        str(clean_line), 6)
                    if clean_line:
                        clean_text += clean_line + '\n'
            if not clean_text:
                logger.error(
                    ("textract was unable to parse " +
                     "the contents of the document '{}'").format(file_path))
                return {'status': status, 'args': file_path, 'data': content}
            summary, freq_words, sentiment = text_summary(clean_text, 20)
            tags = list(freq_words)[:5]
            clean_text_bytes = bytes(clean_text, encoding=encoding)
            clean_text_b64str = base64.b64encode(clean_text_bytes).decode(
                'utf-8')
            hex_dig = hashlib.sha512(clean_text_bytes).hexdigest()
            content = {
                'meta': {
                    'dir_root': root,
                    'folder_file': folder,
                    'filename': file_name,
                    'extension': file_extension,
                    'content_sha512_hex': hex_dig,
                    **pdfinfo
                },
                'content': clean_text,
                'content_base64': clean_text_b64str,
                'summary': summary,
                'created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'tags': tags,
                'sentiment': sentiment
            }
    logger.debug('Gevent (end parse_pdf): {} - {}'.format(
        gevent.getcurrent().name, time() - t0))
    if not content:
        status = 'error'
        logger.error("Empty content for '{}'".format(file_path))
    else:
        status = 'ok'
    return {'status': status, 'args': file_path, 'data': content}
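# Hedged sketch: get_pdfinfo() is called above but not defined in this
# section. One plausible implementation shells out to poppler's pdfinfo
# binary and parses its "Key: value" output; the 'pages' key consumed above
# is an assumption about the returned dict:
def get_pdfinfo(file_path):
    info = {}
    try:
        out = subprocess.check_output(['pdfinfo', file_path])
    except (subprocess.CalledProcessError, OSError):
        return info
    for raw in out.decode('utf-8', errors='replace').splitlines():
        if ':' not in raw:
            continue
        key, _, value = raw.partition(':')
        info[key.strip().lower()] = value.strip()
    if 'pages' in info:
        info['pages'] = int(info['pages'])
    return info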
def on_exception(greenlet):
    logger.error("Greenlet '{}' died unexpectedly. Args: '{}'".format(
        greenlet, greenlet.args))
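# Illustrative wiring sketch: on_exception() is meant to be attached to each
# worker greenlet via gevent's link_exception() callback, e.g. when spawning
# the parsers (the arguments shown are assumptions):
#
# g = gevent.spawn(parse_pdf, root, file_name, file_extension)
# g.link_exception(on_exception)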
if __name__ == '__main__':
    scheduler = GeventScheduler()
    config_app = config.get('app')
    config_es = config.get('elasticsearch')
    interval = config.get('freq_min', 5)
    if not config_app:
        logger.error('Missing: config > app')
        sys.exit(1)
    if not config_es:
        logger.error('Missing: config > elasticsearch')
        sys.exit(1)
    es_addr = config_es.get('host', '127.0.0.1')
    es_port = config_es.get('port', 9200)
    dir_root = config_app.get('dir_root')
    dir_processed = config_app.get('dir_processed')
    dir_error = config_app.get('dir_errors')
    if not dir_root:
        logger.error('Missing: config > app > dir_root')
        sys.exit(1)
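    # Hedged sketch of how the pieces above plausibly come together: the
    # section ends before the scheduler is started, so the job function
    # (process_documents) and its arguments are assumptions. add_job() and
    # start() are the standard APScheduler GeventScheduler calls; start()
    # returns a greenlet that keeps the process alive via join().
    #
    # es_init(es_addr, es_port)
    # scheduler.add_job(process_documents, 'interval', minutes=interval,
    #                   args=[dir_root, dir_processed, dir_error])
    # g = scheduler.start()
    # g.join()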