Python TMUtilsMatching.check_stopwords 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: TMMatching.TMUtilsMatching

클래스/타입: TMUtilsMatching

메소드/함수: check_stopwords

hotexamples.com에서의 예제들: 6

Python TMUtilsMatching.check_stopwords - 6개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 TMMatching.TMUtilsMatching.TMUtilsMatching.check_stopwords의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

pre_process(12)

check_stopwords(6)

segment_2_universal(4)

_edit_distance(3)

_ter_score(3)

len_compare(2)

pos_bool(2)

position_distance(2)

strip_tags(2)

transfer_tags(2)

empty_list(1)

reduce_tags(1)

ter_distance(1)

un_match_distance(1)

예제 #1

파일 보기

파일: TMSplit.py 프로젝트: MittagQI/nectm

    def __init__(self):
        """Prepare the 'ES' resources and the table of splitting rules.

        Sets four attributes:
          * ``order`` -- list of rule names.
          * ``sw``    -- result of ``TMUtilsMatching.check_stopwords('ES')``.
          * ``ut``    -- result of ``TMTextProcessors.univ_pos_tagger('ES')``.
          * ``rules`` -- mapping from rule name to a ``RulesPattern``.
        """
        # Per the original note, 'last' stands for the subordinate rule.
        self.order = ['comma', 'conjunction', 'compose_sub', 'last']

        self.sw = TMUtilsMatching.check_stopwords('ES')
        self.ut = TMTextProcessors.univ_pos_tagger('ES')

        # Every rule uses the same second argument; the third argument is
        # the tail of the first one, starting at the separator tag group.
        # Presumably the arguments are (pattern, left, right) -- confirm
        # against RulesPattern.
        left_side = '<.*>*<V.*><.*>*'

        conjunction_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CC|CCNEG|CCAD><.*>*<V.*><.*>*',
            left_side,
            '<CC|CCNEG|CCAD><.*>*<V.*><.*>*',
        )
        comma_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
            left_side,
            '<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
        )
        # Separator tag immediately followed by a subordinating tag.
        compose_sub_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
            left_side,
            '<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
        )
        # NOTE(review): the first pattern below omits CQ while the third one
        # lists it -- confirm whether the difference is intentional.
        last_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CQUE|CSUBF|CSUBI|CSUBX><.*>*<V.*><.*>*',
            left_side,
            '<CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
        )

        self.rules = {
            'conjunction': conjunction_rule,
            'comma': comma_rule,
            'compose_sub': compose_sub_rule,
            'last': last_rule,
        }

예제 #2

파일 보기

파일: TMSplit.py 프로젝트: MittagQI/nectm

    def __init__(self):
        """Prepare the 'EN' resources and the table of splitting rules.

        Sets four attributes:
          * ``order`` -- list of rule names.
          * ``sw``    -- result of ``TMUtilsMatching.check_stopwords('EN')``.
          * ``ut``    -- result of ``TMTextProcessors.univ_pos_tagger('EN')``.
          * ``rules`` -- mapping from rule name to a ``RulesPattern``.
        """
        # Per the original note, 'last' stands for the subordinate rule.
        self.order = ['comma', 'conjunction', 'compose_sub', 'last']

        self.sw = TMUtilsMatching.check_stopwords('EN')
        self.ut = TMTextProcessors.univ_pos_tagger('EN')

        # The patterns are raw strings because several of them contain
        # backslash-escaped punctuation (\, \; \: \-). In a normal string
        # literal those are invalid escape sequences, which Python reports
        # with a warning and plans to reject; the raw form yields exactly
        # the same text without the warning.
        self.rules = {
            # pattern ---> left ---> right
            'conjunction':
            RulesPattern(r'<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         r'<.*>*<V.*><.*>*',
                         r'<CC><.*>*<V.*><.*>*'),
            'last':
            RulesPattern(r'<.*>*<V.*><.*>*<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                         r'<.*>*<V.*><.*>*',
                         r'<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            # Separator tag immediately followed by a wh-word / "that" tag.
            'compose_sub':
            RulesPattern(
                r'<.*>*<V.*><.*>*<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                r'<.*>*<V.*><.*>*',
                r'<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            'comma':
            RulesPattern(
                r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                r'<.*>*<V.*><.*>*',
                r'<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }

예제 #3

파일 보기

  def _only_word_sequence(text, lang):
    """Separate the tokens of *text* into content words and a marker sequence.

    Tokens are the maximal runs of ASCII letters and digits, CJK ideographs
    (U+4E00-U+9FFF), hiragana (U+3040-U+309F), the prolonged sound mark and
    katakana (U+30A0-U+30FF) found in *text*.

    Returns a pair ``(words, markers)``:
      * ``words``   -- the tokens that are not in the stop-word collection
        returned by ``TMUtilsMatching.check_stopwords(lang)``.
      * ``markers`` -- one entry per token: the token itself when it is a
        stop word, otherwise the placeholder ``'P'``.

    Both lists are empty when no stop-word collection is available for
    *lang* (``check_stopwords`` returned a falsy value).
    """
    stop_words = TMUtilsMatching.check_stopwords(lang)
    tokens = [
        found.group()
        for found in re.finditer(r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+', text)
    ]

    # Without tokens or without a stop-word collection nothing is recorded.
    if not tokens or not stop_words:
      return [], []

    words = []
    markers = []
    for token in tokens:
      if token in stop_words:
        markers.append(token)
        continue
      markers.append('P')
      words.append(token)

    return words, markers

예제 #4

파일 보기

파일: TMSplit.py 프로젝트: MittagQI/nectm

    def __init__(self):
        """Prepare the 'FR' resources and the table of splitting rules.

        Sets four attributes:
          * ``order`` -- list of rule names.
          * ``sw``    -- result of ``TMUtilsMatching.check_stopwords('FR')``.
          * ``ut``    -- result of ``TMTextProcessors.univ_pos_tagger('FR')``.
          * ``rules`` -- mapping from rule name to a ``RulesPattern``.
        """
        # Per the original note, 'last' is the conjunction rule here.
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords('FR')
        self.ut = TMTextProcessors.univ_pos_tagger('FR')

        # Both rules share the same second argument; presumably the
        # arguments are (pattern, left, right) -- confirm against
        # RulesPattern.
        left_side = '<.*>*<V.*><.*>*'

        conjunction_rule = RulesPattern(
            '<.*>*<V.*><.*>*<KON><.*>*<V.*><.*>*',
            left_side,
            '<KON><.*>*<V.*><.*>*',
        )
        # PUN / SENT: punctuation marks.
        punctuation_rule = RulesPattern(
            '<.*>*<V.*><.*>*<PUN|SENT><.*>*<V.*><.*>*',
            left_side,
            '<PUN|SENT><.*>*<V.*><.*>*',
        )

        self.rules = {
            'last': conjunction_rule,
            'comma': punctuation_rule,
        }

예제 #5

파일 보기

파일: TMSplit.py 프로젝트: MittagQI/nectm

    def __init__(self):
        """Prepare the 'EN' resources and a two-rule splitting table.

        Sets four attributes:
          * ``order`` -- list of rule names.
          * ``sw``    -- result of ``TMUtilsMatching.check_stopwords('EN')``.
          * ``ut``    -- result of ``TMTextProcessors.univ_pos_tagger('EN')``.
          * ``rules`` -- mapping from rule name to a ``RulesPattern``.
        """
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords('EN')
        self.ut = TMTextProcessors.univ_pos_tagger('EN')

        # The patterns are raw strings because the punctuation rule contains
        # backslash-escaped characters (\, \; \: \-). In a normal string
        # literal those are invalid escape sequences, which Python reports
        # with a warning and plans to reject; the raw form yields exactly
        # the same text without the warning.
        #
        # NOTE(review): the 'comma' key holds the <CC> pattern while 'last'
        # holds the punctuation pattern -- the names look swapped; confirm
        # this is intentional before relying on the key names.
        self.rules = {
            # pattern ---> left ---> right
            'comma':
            RulesPattern(r'<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         r'<.*>*<V.*><.*>*', r'<CC><.*>*<V.*><.*>*'),
            'last':
            RulesPattern(r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                         r'<.*>*<V.*><.*>*', r'<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }

예제 #6

파일 보기

파일: TMSplit.py 프로젝트: MittagQI/nectm

    def __init__(self, lang):
        """Prepare language resources and a two-rule splitting table.

        Args:
            lang: value passed unchanged to
                ``TMUtilsMatching.check_stopwords`` and
                ``TMTextProcessors.univ_pos_tagger``.

        Sets four attributes:
          * ``order`` -- list of rule names.
          * ``sw``    -- result of ``TMUtilsMatching.check_stopwords(lang)``.
          * ``ut``    -- result of ``TMTextProcessors.univ_pos_tagger(lang)``.
          * ``rules`` -- mapping from rule name to a ``RulesPattern``.
        """
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords(lang)
        self.ut = TMTextProcessors.univ_pos_tagger(lang)

        # The patterns are raw strings because the 'comma' rule contains
        # '\.'. In a normal string literal that is an invalid escape
        # sequence, which Python reports with a warning and plans to reject;
        # the raw form yields exactly the same text without the warning.
        self.rules = {
            # Conjunctions.
            # NOTE(review): the first pattern accepts <CONJ|SCONJ> but the
            # third one only lists <CONJ> -- confirm whether SCONJ was
            # left out on purpose.
            'last':
            RulesPattern(
                r'<.*>*<VERB><.*>*<CONJ|SCONJ><.*>*<VERB><.*>*',
                r'<.*>*<VERB><.*>*',
                r'<CONJ><.*>*<VERB><.*>*'),
            # Rule keyed 'comma'; the separator tag it matches is '<\.>'.
            'comma':
            RulesPattern(r'<.*>*<VERB><.*>*<\.><.*>*<VERB><.*>*',
                         r'<.*>*<VERB><.*>*', r'<\.><.*>*<VERB><.*>*'),
        }