Beispiel #1
0
    def Parse(
        self,
        communication_log,
    ):
        """Re-parse one communication log and persist its conversations.

        Steps, in order:

        1. Build ``self.keywords_regex`` from the lexicon entries returned
           for the log's enterprise.
        2. Delete the conversations and communications already stored for
           this log.
        3. Resolve the log's file path, fill ``self.conversations`` via
           ``fill_conversations()`` and save each conversation.
        4. Mark the log as parsed.

        Any ``Exception`` raised along the way is logged and the log is
        marked as parsed together with the exception type; nothing is
        re-raised.

        :param communication_log: record of the log to parse; the keys
            ``'_id'`` and ``'enterprise_id'`` are read here.
        """
        try:
            self.communication_log = communication_log
            enterprise_id = communication_log['enterprise_id']

            # A dict (not a set) de-duplicates the keywords while keeping
            # the same container semantics the pattern was always built from.
            keywords = {}
            for entry in db.Lexicon.GetByEnterprise(enterprise_id):
                for keyword in entry['keywords']:
                    if keyword and not keyword.isspace():
                        keywords[keyword.strip()] = 1

            if keywords:
                # NOTE(review): keywords are joined unescaped, so any regex
                # metacharacter inside a lexicon keyword is interpreted as
                # pattern syntax. Confirm whether that is intended before
                # wrapping them in re.escape().
                self.keywords_regex = re.compile('|'.join(keywords),
                                                 re.IGNORECASE)
            else:
                # Fallback pattern used when the lexicon yields no keywords.
                self.keywords_regex = re.compile('---', re.IGNORECASE)
                # Logger-side formatting: a non-string enterprise_id must
                # not turn this warning into a TypeError that fails the
                # whole parse.
                LOG.warning('Lexicon is empty for enterprise_id=%s',
                            enterprise_id)

            # Drop whatever an earlier parse of this log stored.
            db.Conversations.DeleteByCommunicationLogId(
                self.communication_log['_id'])
            db.Communications.DeleteByCommunicationLogId(
                self.communication_log['_id'])

            self.file_path = db.CommunicationLogs.GetFilePath(
                communication_log)
            self.conversations = []
            self.fill_conversations()
            for communications in self.conversations:
                self.__save_conversation(communications)
            db.CommunicationLogs.SetParsed(communication_log['_id'])
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must propagate instead of being recorded as a
            # parse error.
            error_type = sys.exc_info()[0]
            LOG.exception(error_type)
            db.CommunicationLogs.SetParsed(communication_log['_id'],
                                           error_type)
Beispiel #2
0
def scan_directory(path: str, base_name: str) -> Optional[Files]:
    """Find the main base file and gather info from the sub-directories.

    Only the direct children of ``path`` are inspected. A regular file
    whose name equals ``base_name`` is taken as the base file; every
    sub-directory is handed to ``get_info`` and the returned mappings are
    merged (per the original description these hold the resume files).

    :param path: directory that holds the data
    :type path: str
    :param base_name: name of the file that contains the base
    :type base_name: str
    :return: ``Files`` built from the base file path and the merged
        directory info, or ``None`` when no base file was found
    :rtype: Optional[Files]
    """
    dirs = {}
    base_fp = None
    # The context manager closes the scandir iterator (and its directory
    # handle) even when get_info raises part-way through the listing.
    with os.scandir(path) as entries:
        for element in entries:
            if element.is_file() and element.name == base_name:
                base_fp = element.path
            elif element.is_dir():
                dirs.update(get_info(element))

    if not base_fp:
        LOG.warning("Cannot find basefile")
        return None

    files = Files(base_fp, dirs)
    LOG.debug("Get basefile and resume")
    LOG.debug("%s", files)
    return files
def valid_data(flat_info):
    """Validate one extracted flat record and report the verdict.

    ``flat_info`` is unpacked as ``(description, postcode_area, bedrooms,
    price, website)``. Each failed check is logged, and a failure never
    stops the remaining checks from running.

    Returns ``True`` only when every check passes, ``False`` otherwise.
    """
    description, postcode_area, bedrooms, price, website = flat_info
    accepted_postcodes = ('EH6', 'EH7', '')  # an empty postcode is tolerated
    failed_checks = 0

    if not description:
        LOG.warning(f'No description found: {website}. Skipping div')
        failed_checks += 1

    if postcode_area not in accepted_postcodes:
        LOG.info(
            f'{postcode_area} not a valid postcode: {website}. Skipping div')
        failed_checks += 1

    if bedrooms <= 0:
        LOG.warning(
            f'{bedrooms} not a valid num of bedrooms: {website}. Skipping div')
        failed_checks += 1

    if price <= 0:
        LOG.warning(f'{price} not a valid price: {website}. Skipping div')
        failed_checks += 1

    return failed_checks == 0
Beispiel #4
0
        source = sys.argv[2]
        if source in ['*', '.', '%']:
            source = None

        import db
        import settings
        for source_name in settings.COMMUNICATION_LOG_SOURCES:
            if source and source_name != source:
                continue
            answer = raw_input(
                'ATTENTION! Dropping all the data for source: ' + source_name +
                '. Proceed? [y/n]').lower()
            if answer != 'y':
                print 'Canceled'
                continue
            LOG.warning('Dropping Conversations for source: ' + source_name)
            db.Conversations.DeleteBySource(source_name)
            LOG.warning('Dropping Communications for source: ' + source_name)
            db.Communications.DeleteBySource(source_name)
            LOG.warning('Dropping CommunicationLogs for source: ' +
                        source_name)
            db.CommunicationLogs.DeleteBySource(source_name)
            import os
            directory = settings.DOWNLOAD_DIR + '/' + source_name
            LOG.warning('Deleting: ' + directory)
            if os.path.exists(directory):
                import shutil
                shutil.rmtree(directory)
    else:
        print('Unknown command: ' + sys.argv[1])
    def starter_(self, count_connection_attempt):
        """Tear down the current stream/parser pipeline and start a new one.

        Does nothing when ``self.disposing`` is set. Otherwise, in order:

        1. Clears ``run_kinesis_stream_reader`` and closes the current
           ``kinesis_stream`` and ``libav_input_descriptor`` (close errors
           are logged and ignored).
        2. Increments ``connection_attempts_count`` or
           ``connection_renewals_count`` and gives up (plain ``return``)
           once ``connection_attempts_count`` exceeds
           ``reconnect_max_count``.
        3. Calls ``set_kinesis_stream()``.
        4. Joins the previous parser and reader threads, if any.
        5. Resets the tag state under ``tags_lock``.
        6. Recreates the FIFO at ``kinesis_stream_pipe``.
        7. Starts new daemon threads running ``kinesis_stream_reader`` and
           ``libav_parser``.

        Any exception raised by these steps is logged and followed by
        ``self.dispose()``; nothing is re-raised.

        :param count_connection_attempt: truthy to count this call as a
            connection attempt, falsy to count it as a connection renewal.
        """
        try:
            if self.disposing:
                return

            #stop and clean everything before [re-]starting

            self.run_kinesis_stream_reader = False
            if self.kinesis_stream:
                try:
                    self.kinesis_stream.close()
                # NOTE(review): bare except also catches KeyboardInterrupt
                # and SystemExit; the close is treated as best-effort.
                except:
                    LOG.exception(sys.exc_info()[0])
                self.kinesis_stream = None

            if self.libav_input_descriptor:
                try:
                    #(it was checked:) after closing pipe input, libav will still read all the packets until EOF
                    LOG.info('Closing libav_input_descriptor...')
                    self.libav_input_descriptor.close()
                # NOTE(review): same best-effort bare except as above.
                except:
                    LOG.exception(sys.exc_info()[0])
                self.libav_input_descriptor = None

            # Exactly one of the two counters is bumped per call.
            if count_connection_attempt:
                self.connection_attempts_count += 1
            else:
                self.connection_renewals_count += 1
            # The limit is checked on every call, renewals included. On this
            # exit the old stream and input descriptor are already closed and
            # nothing is restarted.
            if self.connection_attempts_count > self.reconnect_max_count:
                LOG.warning('Stopping because reconnect count exceeded %d...' %
                            self.reconnect_max_count)
                return

            self.set_kinesis_stream()

            # join() is called without a timeout, so this waits until the
            # previous parser thread has finished.
            if self.libav_parser_thread:
                LOG.info(
                    "Wating libav_parser_thread to read remaining packets and stop..."
                )
                self.libav_parser_thread.join()
                self.libav_parser_thread = None
                LOG.info("libav_parser_thread has been stopped.")
            self.libav_output_reader = None

            # Same unbounded wait for the previous reader thread.
            if self.kinesis_stream_reader_thread:
                LOG.info("Wating kinesis_stream_reader_thread to stop...")
                self.kinesis_stream_reader_thread.join()
                self.kinesis_stream_reader_thread = None
                LOG.info("kinesis_stream_reader_thread has been stopped.")

            # Tag state is reset while holding tags_lock.
            with self.tags_lock:
                self.tags_line = []
                self.last_packet_tags = None  #the main use apart, it shows if at least one packet was read after [re-]connection

            # Always start from a freshly created FIFO.
            if os.path.exists(self.kinesis_stream_pipe):
                os.remove(
                    self.kinesis_stream_pipe
                )  #clean the pipe if it remains with data after Parser interruption
            os.mkfifo(self.kinesis_stream_pipe)

            # The run flag is set before the reader thread is started.
            self.run_kinesis_stream_reader = True
            self.kinesis_stream_reader_thread = Thread(
                target=self.kinesis_stream_reader, args=())
            self.kinesis_stream_reader_thread.daemon = True
            self.kinesis_stream_reader_thread.start()

            # Likewise for the parser thread and its run flag.
            self.run_libav_parser = True
            self.libav_parser_thread = Thread(target=self.libav_parser,
                                              args=())
            self.libav_parser_thread.daemon = True
            self.libav_parser_thread.start()

        # NOTE(review): bare except, so KeyboardInterrupt/SystemExit raised
        # during a restart also end in dispose() and are not re-raised.
        # Confirm this is intended before narrowing to ``except Exception``.
        except:
            LOG.exception(sys.exc_info()[0])
            self.dispose()

        # No-op clause: there is no cleanup attached to it.
        finally:
            pass