Exemple #1
0
 def store_record(self, index_name, doc_name, content):
     """Index a single document in Elasticsearch.

     Args:
         index_name (str): Name of the index that receives the document.
         doc_name (str): Forwarded to the client as ``doc_type``.
         content (dict): Document body to store.

     Returns:
         The response of ``self.es.index`` on success. ``None`` when the
         client is not connected, an argument fails validation, or the
         indexing call raises (the method never propagates the error).
     """
     if not self.is_connected():
         logger.error('Error. Not connected to Elasticsearch')
         return None

     if not isinstance(index_name, str):
         logger.error('Error. Index name must be a str')
         return None

     if not isinstance(doc_name, str):
         logger.error('Error. Missing document name to store in Elasticsearch')
         return None

     if not isinstance(content, dict):
         logger.error('Error. Missing content to store in Elasticsearch')
         return None

     # Timing and greenlet name are logged around the (blocking) client call.
     t1 = time()
     logger.debug("Gevent (before es_obj.index): '{}'".format(gevent.getcurrent().name))

     try:
         res = self.es.index(index=index_name, doc_type=doc_name, body=content)
     except Exception as ex:
         # Keep the best-effort contract (no exception reaches the caller),
         # but record the cause so the failure can be diagnosed.
         logger.error('Error. Something went wrong storing the data. '
                      'Error: {}'.format(ex))
         return None

     logger.debug("Gevent (after es_obj.index: '{}' - {}".
         format(gevent.getcurrent().name, time() - t1))
     return res
Exemple #2
0
 def connect(self):
     """Create the Elasticsearch client and log whether it is reachable.

     The client is built from ``self.host`` / ``self.port`` and stored in
     ``self.es``. The outcome of ``self.is_connected()`` decides whether an
     info or an error line is logged; nothing is returned or raised here.
     """
     self.es = Elasticsearch([{'host': self.host, 'port': self.port}])

     # Choose the log level and the message prefix in one place, then emit
     # a single line with the shared "<msg> <host>:<port>" layout.
     if self.is_connected():
         emit, msg = logger.info, 'Connected to ElasticSearch on'
     else:
         emit, msg = logger.error, 'Error. Failed to connect to Elasticsearch on'

     emit('{msg} {host}:{port}'.format(msg=msg, host=self.host,
                                       port=self.port))
Exemple #3
0
    def create_index(self, index_name, mapping=None):
        """Create the index ``index_name`` unless it already exists.

        Args:
            index_name (str): Name of the index to create.
            mapping (dict, optional): Passed as the request ``body`` of the
                create call. A falsy value is forwarded unchanged.

        Returns:
            ``True`` when no exception was raised (the index already existed
            or the create call returned). ``None`` when the client is not
            connected, an argument fails validation, or the client raises.
        """
        if not self.is_connected():
            logger.error('Error. Not connected to Elasticsearch')
            return None

        if not isinstance(index_name, str):
            logger.error('Error. Index name must be a str')
            return None

        if mapping and not isinstance(mapping, dict):
            logger.error('Error. Mapping must be a dictionary')
            return None

        try:
            already_exists = self.es.indices.exists(index_name)
            if not already_exists:
                # Ignore 400 means to ignore "Index Already Exist" error.
                # NOTE(review): 404 is ignored as well and the response is
                # not inspected, so a rejected create is not detected here.
                self.es.indices.create(index=index_name,
                                       body=mapping,
                                       ignore=[400, 404])
        except Exception as ex:
            logger.error("Error creating the index '{}'.Error: {}".
                         format(index_name, str(ex)))
            return None

        # Report what actually happened instead of always claiming a creation.
        if already_exists:
            logger.info("Index '{}' already exists, creation skipped".
                        format(index_name))
        else:
            logger.info("Index '{}' was created successfully".
                        format(index_name))
        return True
Exemple #4
0
    def search(self, index_name, content):
        """Run a search request against ``index_name``.

        Args:
            index_name (str): Name of the index to query.
            content (dict | str): Query passed to the client as ``body``.
                A ``str`` is still accepted for backward compatibility.

        Returns:
            The response of ``self.es.search``, or ``None`` when the client
            is not connected or an argument fails validation.
        """
        if not self.is_connected():
            logger.error('Error. Not connected to Elasticsearch')
            return None

        if not isinstance(index_name, str):
            logger.error('Error. Index name must be a str')
            return None

        # The original check only allowed ``str`` and reported the problem
        # through an undefined name (``loggin``), so dictionary queries
        # crashed with NameError instead of being executed.
        if not isinstance(content, (dict, str)):
            logger.error('Error. Content must be a dictionary or a str')
            return None

        return self.es.search(index=index_name, body=content)
Exemple #5
0
def es_init(es_addr, es_port):
    """Connect to Elasticsearch and rebuild every index listed in ``mappings``.

    For each key of ``mappings`` the index is first passed to
    ``secure_delete_index`` and then to ``create_index`` together with its
    mapping. The first step that reports failure is logged and stops the
    whole initialisation. The function always returns ``None``.
    """
    client = ES(es_addr, es_port)
    client.connect()

    # Iterate over a snapshot of the keys, one index at a time.
    for index_name in tuple(mappings):
        deleted = client.secure_delete_index(index_name)
        if not deleted:
            logger.error("Error deleting index '{}'".format(index_name))
            return None

        created = client.create_index(index_name, mappings.get(index_name, ''))
        if not created:
            logger.error("Error creating index '{}'".format(index_name))
            return None

    return None
Exemple #6
0
    def secure_delete_index(self, index_name):
        """Delete ``index_name`` after asking the user for confirmation.

        The index is deleted only when it exists and ``utils.query_yes_no``
        returns a truthy answer.

        Returns:
            ``True`` when the checks passed, whether or not anything was
            deleted. ``None`` when the client is not connected or
            ``index_name`` is not a ``str``.
        """
        connected = self.is_connected()
        if not connected:
            logger.error('Error. Not connected to Elasticsearch')
            return None

        if type(index_name) is not str:
            logger.error('Error. Index name must be a str')
            return None

        prompt = "Do you want to delete the index '{}'?".format(index_name)
        index_present = self.es.indices.exists(index_name)

        # The question is only asked when there is something to delete.
        if index_present and utils.query_yes_no(prompt, False):
            self.es.indices.delete(index=index_name)
            logger.info("The index {} was deleted successfully".
                         format(index_name))

        return True
Exemple #7
0
def parse_pdf2img(filename, folder_img):
    """Render pages of a PDF as JPEG files and move them into ``folder_img``.

    ``convert_from_path`` is asked to write its output (80 dpi, JPEG,
    ``last_page=10``) into a temporary directory. Every file found there is
    then moved into ``folder_img``, which is created through
    ``utils.create_directory`` first.

    Args:
        filename: Path of the PDF document to convert.
        folder_img: Destination directory for the generated images.

    Returns:
        bool: ``True`` on success, ``False`` when any step raised.
    """
    try:
        with tempfile.TemporaryDirectory() as tmppath:
            # Only the files written to ``tmppath`` are used; the value
            # returned by convert_from_path is not needed.
            convert_from_path(filename,
                              dpi=80,
                              fmt='jpeg',
                              strict=False,
                              last_page=10,
                              output_folder=tmppath)

            utils.create_directory(folder_img)
            for name in os.listdir(tmppath):
                shutil.move(os.path.join(tmppath, name), folder_img)
        return True
    except Exception as ex:
        # ``Exception`` instead of a bare ``except`` so that interpreter
        # exits and keyboard interrupts are not swallowed; the cause is
        # logged to make failures diagnosable.
        logger.error(("pdf2image could not convert " +
                      " the document '{}'. Error: {}").format(filename, ex))
        return False
Exemple #8
0
def _clean_pdf_lines(lines, exclude_words, numpages):
    """Filter and normalise the extracted ``lines`` into one text block.

    Rules, applied line by line:

    * an empty line adds a single newline unless the text already ends
      with a blank line;
    * processing stops at the first line containing "disclosure";
    * when ``exclude_words`` is non-empty, lines containing '@' or any of
      the words (whole-word, case-insensitive) are dropped;
    * lines occurring more than ``max(numpages - 10, 4)`` times are dropped
      (presumably repeated headers/footers -- confirm);
    * runs of whitespace are collapsed and the line is passed through
      ``utils.remove_nonsense_lines``.
    """
    from collections import Counter

    # Counting once replaces a ``list.count`` call per line (quadratic).
    occurrences = Counter(lines)
    max_repeats = max(numpages - 10, 4)
    # ``re.escape`` treats every exclusion word literally, not only '+'.
    patterns = [
        re.compile(r'\b' + re.escape(word.lower()) + r'\b')
        for word in exclude_words
    ]

    clean_text = ''
    for line in lines:
        if not line and clean_text[-2:] != '\n\n':
            clean_text += '\n'
            continue

        lowered = line.lower()
        if 'disclosure' in lowered:
            break

        # The '@' filter is only active together with an exclusion list;
        # this mirrors the behaviour of the original nested loop.
        if patterns and ('@' in lowered
                         or any(p.search(lowered) for p in patterns)):
            continue

        if occurrences[line] > max_repeats:
            continue

        clean_line = re.sub(r'\s+', ' ', line)
        clean_line = utils.remove_nonsense_lines(str(clean_line), 6)
        if clean_line:
            clean_text += clean_line + '\n'

    return clean_text


def parse_pdf(root, file_name, file_extension, folder='', encoding='utf-8'):
    """Extract, clean and summarise the text of one PDF document.

    Args:
        root: Base directory of the document.
        file_name: File name without extension.
        file_extension: Extension including the dot; must be ``'.pdf'``.
        folder: Optional sub-folder between ``root`` and the file.
        encoding: Encoding requested from textract and used to decode its
            output and to encode the cleaned text.

    Returns:
        dict: ``{'status': 'ok' | 'error', 'args': <file path>,
        'data': <content dict, empty on error>}``. Errors are reported for a
        wrong extension, a last line that does not end in ``%%EOF``, a
        textract failure, or an empty result after cleaning.
    """
    t0 = time()
    file_path = os.path.join(root, folder, file_name + file_extension)
    logger.debug('Gevent (init parse_pdf): {}. File: {}'.format(
        gevent.getcurrent().name, file_path))

    # Every failure path returns this same payload.
    failure = {'status': 'error', 'args': file_path, 'data': {}}

    exclude_sent_with_words = []
    file_exc_words = './exclude_words.txt'
    if os.path.isfile(file_exc_words):
        exclude_sent_with_words = utils.read_txt_file(file_exc_words)

    if file_extension != '.pdf':
        logger.error("File extension of '{}' is not '.pdf'".format(file_path))
        return failure

    # The last line must end in the PDF end-of-file marker.
    # Accepted endings: %%EOF, %%EOF\n, %%EOF\r, %%EOF\r\n
    # (The previous test looked for the 5-byte marker inside a 4-byte slice
    # and was inverted, so it could never trigger.)
    last_line = subprocess.check_output(['tail', '-n', '1', file_path])
    last_line = last_line.rstrip()
    if not last_line.endswith(b'%%EOF'):
        # Only the tail is logged; it may be binary, hence errors='replace'.
        logger.error("Error reading EOF bytes '{}' from '{}'".format(
            last_line[-16:].decode('utf-8', errors='replace'), file_path))
        return failure

    t1 = time()

    pdfinfo = get_pdfinfo(file_path)
    numpages = pdfinfo.get('pages', -1)

    logger.debug('Gevent (before textract.process): {}'.format(
        gevent.getcurrent().name))
    try:
        raw_text = textract.process(file_path, encoding=encoding)
    except Exception as ex:
        # ``Exception`` rather than a bare ``except`` so that BaseException
        # subclasses (interpreter exit, keyboard interrupt) still propagate.
        logger.error(
            ("Unexpected error while parsing PDF file_path '{}' " +
             "using textract. Error: {}").format(file_path, ex))
        return failure

    logger.debug('Gevent (after textract.process: {} - {}'.format(
        gevent.getcurrent().name,
        time() - t1))

    # Decode with the encoding that was requested from textract.
    text = raw_text.decode(encoding)
    text = utils.remove_non_printable_chars(text)
    clean_text = _clean_pdf_lines(text.split('\n'),
                                  exclude_sent_with_words,
                                  numpages)

    if not clean_text:
        logger.error(
            ("textract was unable to parse " +
             "the contents of the document '{}'").format(file_path))
        return failure

    summary, freq_words, sentiment = text_summary(clean_text, 20)
    # At most the first five entries of ``freq_words`` become tags.
    tags = list(freq_words)[:5]

    clean_text_bytes = bytes(clean_text, encoding=encoding)
    clean_text_b64str = base64.b64encode(clean_text_bytes).decode('utf-8')
    hex_dig = hashlib.sha512(clean_text_bytes).hexdigest()

    content = {
        'meta': {
            'dir_root': root,
            'folder_file': folder,
            'filename': file_name,
            'extension': file_extension,
            'content_sha512_hex': hex_dig,
            **pdfinfo
        },
        'content': clean_text,
        'content_base64': clean_text_b64str,
        'summary': summary,
        'created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'tags': tags,
        'sentiment': sentiment
    }

    logger.debug('Gevent (end parse_pdf): {} - {}'.format(
        gevent.getcurrent().name,
        time() - t0))

    return {'status': 'ok', 'args': file_path, 'data': content}
Exemple #9
0
def on_exception(greenlet):
    """Log an error line for a greenlet that died unexpectedly.

    The message contains the greenlet itself and its ``args`` attribute.
    Presumably registered as a greenlet exception callback -- confirm at
    the call site.
    """
    message = "Greenlet '{}' died unexpectedly. Args: '{}'".format(
        greenlet, greenlet.args)
    logger.error(message)
Exemple #10
0
        if not es.create_index(idx, mappings.get(idx, '')):
            logger.error("Error creating index '{}'".format(idx))
            return

    return


if __name__ == '__main__':

    scheduler = GeventScheduler()
    config_app = config.get('app')
    config_es = config.get('elasticsearch')
    # Read from the top-level 'freq_min' key, defaulting to 5.
    interval = config.get('freq_min', 5)

    # Both configuration sections are mandatory: abort without them.
    if not config_app:
        logger.error('Missing: config > app')
        sys.exit(1)

    if not config_es:
        logger.error('Missing: config > elasticsearch')
        sys.exit(1)

    # Default host fixed: the previous literal '127.0.01' was a typo for
    # the loopback address.
    es_addr = config_es.get('host', '127.0.0.1')
    es_port = config_es.get('port', 9200)
    dir_root = config_app.get('dir_root')
    dir_processed = config_app.get('dir_processed')
    dir_error = config_app.get('dir_errors')

    if not dir_root:
        logger.error('Missing: config > app > dir_root')
        sys.exit(1)