Exemple #1
0
    def load_grammars(self):
        """Return the cleaned entries of the POS-sequence file.

        The file located at ``self.pos_sequences_file`` is read through
        ``read_by_line``; every entry has its newline characters removed
        and its surrounding whitespace stripped.

        Returns:
            list: one cleaned string per entry, in the order produced by
            ``read_by_line``.
        """
        return [
            raw_line.replace('\n', '').strip()
            for raw_line in read_by_line(self.pos_sequences_file)
        ]
 def load_grammars(self):
     """Load the POS-sequence entries from ``self.pos_sequences_file``.

     Each entry returned by ``read_by_line`` is normalised by dropping
     newline characters and trimming leading/trailing whitespace.

     Returns:
         list: the normalised entries, in the order they were read.
     """
     def _clean(entry):
         # Remove embedded newlines first, then trim the edges.
         return entry.replace('\n','').strip()

     return [_clean(entry) for entry in read_by_line(self.pos_sequences_file)]
Exemple #3
0
    def __init__(self, config=None, solrClient=None):
        """Configure the instance from the ``DEFAULT`` section of ``config``.

        Args:
            config: object supporting ``config['DEFAULT'][key]`` lookups.
                When ``None``, a ``configparser.ConfigParser`` is created
                and loaded from ``../config/config`` relative to this
                module's directory.
            solrClient: client object to store on ``self.solrClient``.
                When ``None``, a ``SolrClient`` is built from the
                configured ``solr_core_url``.

        Raises:
            Exception: if 'pos_sequence_filter' or 'solr_core_url' is not
                configured (the original ``KeyError`` is chained as the
                cause).
            ValueError: propagated from ``int()`` when a numeric setting
                holds a non-integer value.

        Note:
            ``self.linguistic_processor`` and ``self.stopword_list`` are
            read before being assigned here, so they are presumably
            class-level attributes -- confirm against the class body.
        """
        self._logger = logging.getLogger(__name__)

        if self.linguistic_processor is None:
            self.linguistic_processor = LinguisticPreprocessor()

        if config is None:
            import configparser
            config = configparser.ConfigParser()
            config.read(
                os.path.join(os.path.dirname(__file__), '..', 'config',
                             'config'))

        # --- mandatory settings: a missing key is fatal -------------------
        try:
            self.pos_sequences_file = config['DEFAULT']['pos_sequence_filter']
        except KeyError as err:
            self._logger.exception(
                "Oops! 'pos_sequence_filter' is not found in config file.")
            raise Exception(
                "Please check 'pos_sequence_filter' is properly configured!"
            ) from err

        try:
            self.solr_core_url = config['DEFAULT']['solr_core_url']
        except KeyError as err:
            # NOTE(review): the message mentions a default index directory,
            # but no fallback exists -- the error is raised.
            errMsg = "Target index url 'solr_core_url' is not configured in config file. Use default index directory instead."
            self._logger.exception(errMsg)
            raise Exception(errMsg) from err

        # --- optional settings: fall back to the documented default -------
        try:
            self._max_tokens = int(config['DEFAULT']['max_tokens'])
        except KeyError:
            errMsg = "'max_tokens' is not configured in config file. Default as 6 instead."
            self._logger.warning(errMsg)
            self._max_tokens = 6

        try:
            self._min_tokens = int(config['DEFAULT']['min_tokens'])
        except KeyError:
            errMsg = "'min_tokens' is not configured in config file. Default as 1 instead."
            self._logger.warning(errMsg)
            # The fallback must match the logged default (1, not 6).
            self._min_tokens = 1

        try:
            self._min_char_length = int(config['DEFAULT']['min_char_length'])
        except KeyError:
            errMsg = "'min_char_length' is not configured in config file. Default as 2 instead."
            self._logger.warning(errMsg)
            self._min_char_length = 2

        try:
            self._min_term_freq = int(config['DEFAULT']['min_term_freq'])
        except KeyError:
            errMsg = "'min_term_freq' is not configured in config file. Default is 1 instead."
            self._logger.warning(errMsg)
            self._min_term_freq = 1

        try:
            self.solr_field_content = config['DEFAULT']['solr_field_content']
        except KeyError:
            errMsg = "'solr_field_content' is not configured in config file. Default field name is 'content'"
            self._logger.warning(errMsg)
            self.solr_field_content = "content"

        # --- stopwords: only built when none are present yet --------------
        if not self.stopword_list:
            from nltk.corpus import stopwords
            self.stopword_list = set(stopwords.words('english'))

            try:
                customised_stopword_file = config['DEFAULT']['stopwords']
            except KeyError:
                # Bind the name so the check below cannot hit an unbound
                # local; only the NLTK English list is used in this case.
                customised_stopword_file = None
                errMsg = "Oops! customisable stopword file is not found in config file. Use default english stopword list instead!"
                self._logger.error(errMsg)

            smart_stopword_list = os.path.join(os.path.dirname(__file__), '..',
                                               'config', 'smart-stop-list.txt')
            # NOTE(review): the bundled smart stop list is merged only when
            # a custom stopword file is configured -- confirm this is intended.
            if customised_stopword_file is not None:
                self.stopword_list |= set(
                    read_by_line(customised_stopword_file))
                self.stopword_list |= set(read_by_line(smart_stopword_list))

            self._logger.debug("final stopword size: [%s]",
                               len(self.stopword_list))

        if solrClient is None:
            from SolrClient import SolrClient
            self.solrClient = SolrClient(self.solr_core_url)
        else:
            self.solrClient = solrClient

        self.load_dictionary_tagging_setting(config)

        try:
            # NOTE(review): a configured value is stored as read from the
            # config (not converted), whereas the fallback is the int 1.
            self.parallel_workers = config['DEFAULT']['PARALLEL_WORKERS']
        except KeyError:
            self._logger.exception(
                "Oops! 'PARALLEL_WORKERS' is not found in config file. Running with 1 worker instead."
            )
            self.parallel_workers = 1
 def __init__(self, config=None, solrClient=None):
     """Configure the instance from the ``DEFAULT`` section of ``config``.

     Args:
         config: object supporting ``config['DEFAULT'][key]`` lookups.
             When ``None``, a ``configparser.ConfigParser`` is created
             and loaded from ``../config/config`` relative to this
             module's directory.
         solrClient: client object to store on ``self.solrClient``.
             When ``None``, a ``SolrClient`` is built from the
             configured ``solr_core_url``.

     Raises:
         Exception: if 'pos_sequence_filter' or 'solr_core_url' is not
             configured (the original ``KeyError`` is chained as the
             cause).
         ValueError: propagated from ``int()`` when a numeric setting
             holds a non-integer value.

     Note:
         ``self.linguistic_processor`` and ``self.stopword_list`` are
         read before being assigned here, so they are presumably
         class-level attributes -- confirm against the class body.
     """
     self._logger=logging.getLogger(__name__)

     if self.linguistic_processor is None:
         self.linguistic_processor = LinguisticPreprocessor()

     if config is None:
         import configparser
         config = configparser.ConfigParser()
         config.read(os.path.join(os.path.dirname(__file__), '..', 'config','config'))

     # --- mandatory settings: a missing key is fatal ----------------------
     try:
         self.pos_sequences_file=config['DEFAULT']['pos_sequence_filter']
     except KeyError as err:
         self._logger.exception("Oops! 'pos_sequence_filter' is not found in config file.")
         raise Exception("Please check 'pos_sequence_filter' is properly configured!") from err

     try:
         self.solr_core_url=config['DEFAULT']['solr_core_url']
     except KeyError as err:
         # NOTE(review): the message mentions a default index directory,
         # but no fallback exists -- the error is raised.
         errMsg="Target index url 'solr_core_url' is not configured in config file. Use default index directory instead."
         self._logger.exception(errMsg)
         raise Exception(errMsg) from err

     # --- optional settings: fall back to the documented default ----------
     try:
         self._max_tokens=int(config['DEFAULT']['max_tokens'])
     except KeyError:
         errMsg="'max_tokens' is not configured in config file. Default as 6 instead."
         self._logger.warning(errMsg)
         self._max_tokens=6

     try:
         self._min_tokens=int(config['DEFAULT']['min_tokens'])
     except KeyError:
         errMsg="'min_tokens' is not configured in config file. Default as 1 instead."
         self._logger.warning(errMsg)
         # The fallback must match the logged default (1, not 6).
         self._min_tokens=1

     try:
         self._min_char_length=int(config['DEFAULT']['min_char_length'])
     except KeyError:
         errMsg="'min_char_length' is not configured in config file. Default as 2 instead."
         self._logger.warning(errMsg)
         self._min_char_length=2

     try:
         self._min_term_freq=int(config['DEFAULT']['min_term_freq'])
     except KeyError:
         errMsg="'min_term_freq' is not configured in config file. Default is 1 instead."
         self._logger.warning(errMsg)
         self._min_term_freq=1

     try:
         self.solr_field_content=config['DEFAULT']['solr_field_content']
     except KeyError:
         errMsg="'solr_field_content' is not configured in config file. Default field name is 'content'"
         self._logger.warning(errMsg)
         self.solr_field_content="content"

     # --- stopwords: only built when none are present yet -----------------
     if not self.stopword_list:
         from nltk.corpus import stopwords
         self.stopword_list=set(stopwords.words('english'))

         try:
             customised_stopword_file=config['DEFAULT']['stopwords']
         except KeyError:
             # Bind the name so the check below cannot hit an unbound
             # local; only the NLTK English list is used in this case.
             customised_stopword_file=None
             errMsg="Oops! customisable stopword file is not found in config file. Use default english stopword list instead!"
             self._logger.error(errMsg)

         smart_stopword_list=os.path.join(os.path.dirname(__file__), '..','config','smart-stop-list.txt')
         # NOTE(review): the bundled smart stop list is merged only when
         # a custom stopword file is configured -- confirm this is intended.
         if customised_stopword_file is not None:
             self.stopword_list |= set(read_by_line(customised_stopword_file))
             self.stopword_list |= set(read_by_line(smart_stopword_list))

         self._logger.debug("final stopword size: [%s]", len(self.stopword_list))

     if solrClient is None:
         from SolrClient import SolrClient
         self.solrClient=SolrClient(self.solr_core_url)
     else:
         self.solrClient=solrClient

     self.load_dictionary_tagging_setting(config)

     try:
         # NOTE(review): a configured value is stored as read from the
         # config (not converted), whereas the fallback is the int 1.
         self.parallel_workers=config['DEFAULT']['PARALLEL_WORKERS']
     except KeyError:
         self._logger.exception("Oops! 'PARALLEL_WORKERS' is not found in config file. Running with 1 worker instead.")
         self.parallel_workers = 1