def Parse(
        self,
        communication_log,
):
    """Parse one communication log and persist the conversations found in it.

    Steps, in order:

    1. Load the lexicon for ``communication_log['enterprise_id']`` and compile
       all non-blank keywords into one case-insensitive alternation stored in
       ``self.keywords_regex``.
    2. Delete the conversations and communications already stored for this
       log id (``communication_log['_id']``).
    3. Resolve ``self.file_path``, reset ``self.conversations`` and call
       ``self.fill_conversations()``.
    4. Save every collected conversation and call
       ``db.CommunicationLogs.SetParsed`` for the log.

    Any ``Exception`` raised on the way is logged, and ``SetParsed`` is called
    with the exception type as second argument instead of being propagated.

    :param communication_log: mapping describing the log; the keys
        ``'enterprise_id'`` and ``'_id'`` are read here.
    """
    try:
        self.communication_log = communication_log
        lexicon = db.Lexicon.GetByEnterprise(
            self.communication_log['enterprise_id'])

        # The dict is only used to collapse duplicate keywords.
        keywords = {}
        for entry in lexicon:
            for keyword in entry['keywords']:
                if keyword and not keyword.isspace():
                    keywords[keyword.strip()] = 1

        if keywords:
            # NOTE(review): keywords are joined unescaped, so regex
            # metacharacters inside a keyword act as pattern syntax --
            # confirm that lexicon entries are meant to be patterns.
            self.keywords_regex = re.compile('|'.join(keywords),
                                             re.IGNORECASE)
        else:
            # Placeholder pattern for an empty lexicon; presumably chosen so
            # that the regex does not match ordinary text.
            self.keywords_regex = re.compile('---', re.IGNORECASE)
            # %-formatting instead of ``+`` so that a non-string id cannot
            # raise TypeError and abort the whole parse over a mere warning.
            LOG.warning('Lexicon is empty for enterprise_id=%s' %
                        (self.communication_log['enterprise_id'],))

        # Drop results of any earlier parse of the same log before re-filling.
        db.Conversations.DeleteByCommunicationLogId(
            self.communication_log['_id'])
        db.Communications.DeleteByCommunicationLogId(
            self.communication_log['_id'])

        self.file_path = db.CommunicationLogs.GetFilePath(communication_log)
        self.conversations = []
        self.fill_conversations()
        for communications in self.conversations:
            self.__save_conversation(communications)

        db.CommunicationLogs.SetParsed(communication_log['_id'])
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        LOG.exception(sys.exc_info()[0])
        db.CommunicationLogs.SetParsed(communication_log['_id'],
                                       sys.exc_info()[0])
def scan_directory(path: str, base_name: str) -> Optional[Files]:
    """
    Get the paths to the resume files and to the main base file.

    Every direct child of *path* is inspected: a regular file whose name
    equals *base_name* is taken as the base file, and for every directory the
    mapping returned by ``get_info`` is merged into the collected info.

    :param path: path to the directory with the data
    :type path: str
    :param base_name: name of the file that contains the base
    :type base_name: str
    :return: dataclass with the data, or ``None`` when no base file is found
    :rtype: Optional[Files]
    """
    dirs = {}
    base_fp = None

    # The context manager releases the directory handle even when
    # ``get_info`` raises part-way through the scan.
    with os.scandir(path) as entries:
        for element in entries:
            if element.is_file() and element.name == base_name:
                base_fp = element.path
            if os.path.isdir(element.path):
                dirs.update(get_info(element))

    if not base_fp:
        LOG.warning("Cannot find basefile")
        return None

    files = Files(base_fp, dirs)
    LOG.debug("Get basefile and resume")
    LOG.debug(f"{files}")
    return files
def valid_data(flat_info):
    """Tell whether an extracted flat record passes all sanity checks.

    ``flat_info`` is unpacked as ``(description, postcode_area, bedrooms,
    price, website)``.  Every check is always evaluated, so a record with
    several problems produces one log line per problem.

    Returns ``True`` only when no check failed.
    """
    description, postcode_area, bedrooms, price, website = flat_info
    accepted_areas = ('EH6', 'EH7', '')
    failures = 0

    # Missing description.
    if not description:
        LOG.warning(f'No description found: {website}. Skipping div')
        failures += 1

    # Postcode area outside the accepted ones (logged at info level).
    if postcode_area not in accepted_areas:
        LOG.info(
            f'{postcode_area} not a valid postcode: {website}. Skipping div')
        failures += 1

    # Non-positive bedroom count.
    if bedrooms <= 0:
        LOG.warning(
            f'{bedrooms} not a valid num of bedrooms: {website}. Skipping div')
        failures += 1

    # Non-positive price.
    if price <= 0:
        LOG.warning(f'{price} not a valid price: {website}. Skipping div')
        failures += 1

    return failures == 0
source = sys.argv[2] if source in ['*', '.', '%']: source = None import db import settings for source_name in settings.COMMUNICATION_LOG_SOURCES: if source and source_name != source: continue answer = raw_input( 'ATTENTION! Dropping all the data for source: ' + source_name + '. Proceed? [y/n]').lower() if answer != 'y': print 'Canceled' continue LOG.warning('Dropping Conversations for source: ' + source_name) db.Conversations.DeleteBySource(source_name) LOG.warning('Dropping Communications for source: ' + source_name) db.Communications.DeleteBySource(source_name) LOG.warning('Dropping CommunicationLogs for source: ' + source_name) db.CommunicationLogs.DeleteBySource(source_name) import os directory = settings.DOWNLOAD_DIR + '/' + source_name LOG.warning('Deleting: ' + directory) if os.path.exists(directory): import shutil shutil.rmtree(directory) else: print('Unknown command: ' + sys.argv[1])
def starter_(self, count_connection_attempt):
    """[Re-]start the kinesis stream reader and the libav parser threads.

    Does nothing when ``self.disposing`` is set.  Otherwise the current
    stream and libav input descriptor are closed, the attempt/renewal
    counter is updated, a new stream is set up with
    ``self.set_kinesis_stream()``, the previous worker threads are joined,
    the tag state is reset, the named pipe at ``self.kinesis_stream_pipe`` is
    recreated and two new daemon threads are started.

    The method gives up (after logging a warning) once
    ``self.connection_attempts_count`` exceeds ``self.reconnect_max_count``.
    Any ``Exception`` raised on the way is logged and followed by
    ``self.dispose()``.

    :param count_connection_attempt: when truthy the call is counted in
        ``self.connection_attempts_count``, otherwise in
        ``self.connection_renewals_count``.
    """
    try:
        if self.disposing:
            return

        # stop and clean everything before [re-]starting
        self.run_kinesis_stream_reader = False
        if self.kinesis_stream:
            try:
                self.kinesis_stream.close()
            except Exception:
                # Best effort: a failing close must not prevent the restart.
                LOG.exception(sys.exc_info()[0])
            self.kinesis_stream = None

        if self.libav_input_descriptor:
            try:
                # (it was checked:) after closing pipe input, libav will
                # still read all the packets until EOF
                LOG.info('Closing libav_input_descriptor...')
                self.libav_input_descriptor.close()
            except Exception:
                LOG.exception(sys.exc_info()[0])
            self.libav_input_descriptor = None

        if count_connection_attempt:
            self.connection_attempts_count += 1
        else:
            self.connection_renewals_count += 1

        if self.connection_attempts_count > self.reconnect_max_count:
            LOG.warning('Stopping because reconnect count exceeded %d...' %
                        self.reconnect_max_count)
            return

        self.set_kinesis_stream()

        if self.libav_parser_thread:
            LOG.info(
                "Wating libav_parser_thread to read remaining packets and stop..."
            )
            self.libav_parser_thread.join()
            self.libav_parser_thread = None
            LOG.info("libav_parser_thread has been stopped.")
        # NOTE(review): block nesting was reconstructed from flattened
        # source; this reset is assumed to be unconditional -- confirm.
        self.libav_output_reader = None

        if self.kinesis_stream_reader_thread:
            LOG.info("Wating kinesis_stream_reader_thread to stop...")
            self.kinesis_stream_reader_thread.join()
            self.kinesis_stream_reader_thread = None
            LOG.info("kinesis_stream_reader_thread has been stopped.")

        with self.tags_lock:
            self.tags_line = []
            # the main use apart, it shows if at least one packet was read
            # after [re-]connection
            self.last_packet_tags = None

        if os.path.exists(self.kinesis_stream_pipe):
            # clean the pipe if it remains with data after Parser
            # interruption
            os.remove(self.kinesis_stream_pipe)
        os.mkfifo(self.kinesis_stream_pipe)

        self.run_kinesis_stream_reader = True
        self.kinesis_stream_reader_thread = Thread(
            target=self.kinesis_stream_reader, args=())
        self.kinesis_stream_reader_thread.daemon = True
        self.kinesis_stream_reader_thread.start()

        self.run_libav_parser = True
        self.libav_parser_thread = Thread(target=self.libav_parser, args=())
        self.libav_parser_thread.daemon = True
        self.libav_parser_thread.start()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        LOG.exception(sys.exc_info()[0])
        self.dispose()