def load_grammars(self):
    """Load the POS sequence patterns listed in ``self.pos_sequences_file``.

    The file is read through ``read_by_line`` (presumably one entry per
    line -- confirm against that helper). Every entry has its newline
    characters removed and its surrounding whitespace stripped.

    Returns:
        list: The cleaned entries, in the order they were read.
    """
    raw_entries = read_by_line(self.pos_sequences_file)
    return [entry.replace('\n', '').strip() for entry in raw_entries]
def load_grammars(self):
    """Load the POS sequence patterns listed in ``self.pos_sequences_file``.

    The file is read through ``read_by_line`` (presumably one entry per
    line -- confirm against that helper). Every entry has its newline
    characters removed and its surrounding whitespace stripped.

    Returns:
        list: The cleaned entries, in the order they were read.
    """
    raw_entries = read_by_line(self.pos_sequences_file)
    return [entry.replace('\n', '').strip() for entry in raw_entries]
def __init__(self, config=None, solrClient=None):
    """Configure the instance from ``config['DEFAULT']``.

    Reads the POS sequence file path, the Solr core URL, token/character/
    frequency thresholds, the Solr content field name, stopword sources
    and the number of parallel workers, then creates (or adopts) the Solr
    client and loads the dictionary tagging settings.

    Args:
        config: Mapping-like configuration object with a ``'DEFAULT'``
            section. When ``None``, a ``configparser.ConfigParser`` is
            created and loaded from ``../config/config`` relative to this
            module's directory.
        solrClient: Client object to use for Solr access. When ``None``,
            a ``SolrClient`` is created for the configured
            ``solr_core_url``.

    Raises:
        Exception: If ``pos_sequence_filter`` or ``solr_core_url`` is
            missing from the configuration.
    """
    self._logger = logging.getLogger(__name__)

    # NOTE(review): ``linguistic_processor`` and ``stopword_list`` are read
    # before this method assigns them, so they are presumably class-level
    # attributes declared outside this method -- confirm.
    if self.linguistic_processor is None:
        self.linguistic_processor = LinguisticPreprocessor()

    if config is None:
        import configparser
        config = configparser.ConfigParser()
        config.read(
            os.path.join(os.path.dirname(__file__), '..', 'config', 'config'))

    # --- mandatory settings: a missing key aborts construction ---
    try:
        self.pos_sequences_file = config['DEFAULT']['pos_sequence_filter']
    except KeyError:
        self._logger.exception(
            "Oops! 'pos_sequence_filter' is not found in config file.")
        raise Exception(
            "Please check 'pos_sequence_filter' is properly configured!")

    try:
        self.solr_core_url = config['DEFAULT']['solr_core_url']
    except KeyError:
        # NOTE(review): the message mentions a default index directory,
        # but no fallback exists -- the error is raised.
        errMsg = "Target index url 'solr_core_url' is not configured in config file. Use default index directory instead."
        self._logger.exception(errMsg)
        raise Exception(errMsg)

    # --- optional settings: a missing key falls back to a default ---
    try:
        self._max_tokens = int(config['DEFAULT']['max_tokens'])
    except KeyError:
        errMsg = "'max_tokens' is not configured in config file. Default as 6 instead."
        self._logger.warning(errMsg)
        self._max_tokens = 6

    try:
        self._min_tokens = int(config['DEFAULT']['min_tokens'])
    except KeyError:
        errMsg = "'min_tokens' is not configured in config file. Default as 1 instead."
        self._logger.warning(errMsg)
        # Fallback matches the logged default (the original assigned 6,
        # copied from the max_tokens branch).
        self._min_tokens = 1

    try:
        self._min_char_length = int(config['DEFAULT']['min_char_length'])
    except KeyError:
        errMsg = "'min_char_length' is not configured in config file. Default as 2 instead."
        self._logger.warning(errMsg)
        self._min_char_length = 2

    try:
        self._min_term_freq = int(config['DEFAULT']['min_term_freq'])
    except KeyError:
        errMsg = "'min_term_freq' is not configured in config file. Default is 1 instead."
        self._logger.warning(errMsg)
        self._min_term_freq = 1

    try:
        self.solr_field_content = config['DEFAULT']['solr_field_content']
    except KeyError:
        errMsg = ("'solr_field_content' is not configured in config file. "
                  "Default field name is 'content'")
        self._logger.warning(errMsg)
        self.solr_field_content = "content"

    # Build the stopword set only when none has been loaded yet.
    if len(self.stopword_list) == 0:
        from nltk.corpus import stopwords
        self.stopword_list = set()
        # The union operator is much faster than add
        self.stopword_list |= set(stopwords.words('english'))

        try:
            customised_stopword_file = config['DEFAULT']['stopwords']
        except KeyError:
            errMsg = "Oops! customisable stopword file is not found in config file. Use default english stopword list instead!"
            self._logger.error(errMsg)
            # Keep the name bound so the check below cannot raise NameError.
            customised_stopword_file = None

        smart_stopword_list = os.path.join(
            os.path.dirname(__file__), '..', 'config', 'smart-stop-list.txt')
        if customised_stopword_file is not None:
            self.stopword_list |= set(
                read_by_line(customised_stopword_file))
        self.stopword_list |= set(read_by_line(smart_stopword_list))

        self._logger.debug("final stopword size: [%s]",
                           len(self.stopword_list))

    # dict_term will be loaded for dictionary matching

    if solrClient is None:
        from SolrClient import SolrClient
        self.solrClient = SolrClient(self.solr_core_url)
    else:
        self.solrClient = solrClient

    self.load_dictionary_tagging_setting(config)

    # A missing worker count is not fatal: run with a single worker.
    # NOTE(review): the configured value is stored as returned by the
    # config object (no int() conversion), while the fallback is the
    # int 1 -- confirm that consumers handle both.
    try:
        self.parallel_workers = config['DEFAULT']['PARALLEL_WORKERS']
    except KeyError:
        self._logger.exception(
            "Oops! 'PARALLEL_WORKERS' is not found in config file. Running with 1 worker instead."
        )
        self.parallel_workers = 1
def __init__(self, config=None, solrClient=None):
    """Configure the instance from ``config['DEFAULT']``.

    Reads the POS sequence file path, the Solr core URL, token/character/
    frequency thresholds, the Solr content field name, stopword sources
    and the number of parallel workers, then creates (or adopts) the Solr
    client and loads the dictionary tagging settings.

    Args:
        config: Mapping-like configuration object with a ``'DEFAULT'``
            section. When ``None``, a ``configparser.ConfigParser`` is
            created and loaded from ``../config/config`` relative to this
            module's directory.
        solrClient: Client object to use for Solr access. When ``None``,
            a ``SolrClient`` is created for the configured
            ``solr_core_url``.

    Raises:
        Exception: If ``pos_sequence_filter`` or ``solr_core_url`` is
            missing from the configuration.
    """
    self._logger = logging.getLogger(__name__)

    # NOTE(review): ``linguistic_processor`` and ``stopword_list`` are read
    # before this method assigns them, so they are presumably class-level
    # attributes declared outside this method -- confirm.
    if self.linguistic_processor is None:
        self.linguistic_processor = LinguisticPreprocessor()

    if config is None:
        import configparser
        config = configparser.ConfigParser()
        config.read(
            os.path.join(os.path.dirname(__file__), '..', 'config', 'config'))

    # --- mandatory settings: a missing key aborts construction ---
    try:
        self.pos_sequences_file = config['DEFAULT']['pos_sequence_filter']
    except KeyError:
        self._logger.exception(
            "Oops! 'pos_sequence_filter' is not found in config file.")
        raise Exception(
            "Please check 'pos_sequence_filter' is properly configured!")

    try:
        self.solr_core_url = config['DEFAULT']['solr_core_url']
    except KeyError:
        # NOTE(review): the message mentions a default index directory,
        # but no fallback exists -- the error is raised.
        errMsg = "Target index url 'solr_core_url' is not configured in config file. Use default index directory instead."
        self._logger.exception(errMsg)
        raise Exception(errMsg)

    # --- optional settings: a missing key falls back to a default ---
    try:
        self._max_tokens = int(config['DEFAULT']['max_tokens'])
    except KeyError:
        errMsg = "'max_tokens' is not configured in config file. Default as 6 instead."
        self._logger.warning(errMsg)
        self._max_tokens = 6

    try:
        self._min_tokens = int(config['DEFAULT']['min_tokens'])
    except KeyError:
        errMsg = "'min_tokens' is not configured in config file. Default as 1 instead."
        self._logger.warning(errMsg)
        # Fallback matches the logged default (the original assigned 6,
        # copied from the max_tokens branch).
        self._min_tokens = 1

    try:
        self._min_char_length = int(config['DEFAULT']['min_char_length'])
    except KeyError:
        errMsg = "'min_char_length' is not configured in config file. Default as 2 instead."
        self._logger.warning(errMsg)
        self._min_char_length = 2

    try:
        self._min_term_freq = int(config['DEFAULT']['min_term_freq'])
    except KeyError:
        errMsg = "'min_term_freq' is not configured in config file. Default is 1 instead."
        self._logger.warning(errMsg)
        self._min_term_freq = 1

    try:
        self.solr_field_content = config['DEFAULT']['solr_field_content']
    except KeyError:
        errMsg = ("'solr_field_content' is not configured in config file. "
                  "Default field name is 'content'")
        self._logger.warning(errMsg)
        self.solr_field_content = "content"

    # Build the stopword set only when none has been loaded yet.
    if len(self.stopword_list) == 0:
        from nltk.corpus import stopwords
        self.stopword_list = set()
        # The union operator is much faster than add
        self.stopword_list |= set(stopwords.words('english'))

        try:
            customised_stopword_file = config['DEFAULT']['stopwords']
        except KeyError:
            errMsg = "Oops! customisable stopword file is not found in config file. Use default english stopword list instead!"
            self._logger.error(errMsg)
            # Keep the name bound so the check below cannot raise NameError.
            customised_stopword_file = None

        smart_stopword_list = os.path.join(
            os.path.dirname(__file__), '..', 'config', 'smart-stop-list.txt')
        if customised_stopword_file is not None:
            self.stopword_list |= set(
                read_by_line(customised_stopword_file))
        self.stopword_list |= set(read_by_line(smart_stopword_list))

        self._logger.debug("final stopword size: [%s]",
                           len(self.stopword_list))

    # dict_term will be loaded for dictionary matching

    if solrClient is None:
        from SolrClient import SolrClient
        self.solrClient = SolrClient(self.solr_core_url)
    else:
        self.solrClient = solrClient

    self.load_dictionary_tagging_setting(config)

    # A missing worker count is not fatal: run with a single worker.
    # NOTE(review): the configured value is stored as returned by the
    # config object (no int() conversion), while the fallback is the
    # int 1 -- confirm that consumers handle both.
    try:
        self.parallel_workers = config['DEFAULT']['PARALLEL_WORKERS']
    except KeyError:
        self._logger.exception(
            "Oops! 'PARALLEL_WORKERS' is not found in config file. Running with 1 worker instead."
        )
        self.parallel_workers = 1