예제 #1
0
파일: TMSplit.py 프로젝트: MittagQI/nectm
    def __init__(self):
        """Prepare stop words, POS tagger and split rules for 'ES'.

        Sets four attributes: ``order`` (list of rule names), ``sw`` (result
        of ``TMUtilsMatching.check_stopwords('ES')``), ``ut`` (result of
        ``TMTextProcessors.univ_pos_tagger('ES')``) and ``rules`` (rule name
        -> ``RulesPattern``).
        """
        # Rule names; presumably the order in which the rules are tried.
        # 'last' is the plain subordinate rule.
        self.order = ['comma', 'conjunction', 'compose_sub', 'last']

        self.sw = TMUtilsMatching.check_stopwords('ES')
        self.ut = TMTextProcessors.univ_pos_tagger('ES')

        # Every RulesPattern gets three tag patterns. They look like
        # (whole sequence, left part, right part) -- confirm against the
        # RulesPattern definition.

        # Two verb groups joined by a conjunction tag.
        conjunction_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CC|CCNEG|CCAD><.*>*<V.*><.*>*',
            '<.*>*<V.*><.*>*',
            '<CC|CCNEG|CCAD><.*>*<V.*><.*>*')

        # Two verb groups separated by a CM/COLON/DASH/SEMICOLON tag.
        comma_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
            '<.*>*<V.*><.*>*',
            '<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*')

        # Subordinate clause introduced by a connector tag immediately
        # followed by a subordinating tag.
        compose_sub_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
            '<.*>*<V.*><.*>*',
            '<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*')

        # Subordinate clause introduced by a subordinating tag alone.
        # NOTE(review): the whole-sequence pattern omits CQ while the
        # right-part pattern lists it -- confirm this asymmetry is intended.
        last_rule = RulesPattern(
            '<.*>*<V.*><.*>*<CQUE|CSUBF|CSUBI|CSUBX><.*>*<V.*><.*>*',
            '<.*>*<V.*><.*>*',
            '<CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*')

        self.rules = {
            'conjunction': conjunction_rule,
            'comma': comma_rule,
            'compose_sub': compose_sub_rule,
            'last': last_rule,
        }
예제 #2
0
파일: TMSplit.py 프로젝트: MittagQI/nectm
    def __init__(self):
        """Prepare stop words, POS tagger and split rules for 'EN'.

        Sets four attributes: ``order`` (list of rule names), ``sw`` (result
        of ``TMUtilsMatching.check_stopwords('EN')``), ``ut`` (result of
        ``TMTextProcessors.univ_pos_tagger('EN')``) and ``rules`` (rule name
        -> ``RulesPattern``).
        """
        # Rule names; presumably the order in which the rules are tried.
        # 'last' is the plain subordinate rule.
        self.order = ['comma', 'conjunction', 'compose_sub', 'last']

        self.sw = TMUtilsMatching.check_stopwords('EN')
        self.ut = TMTextProcessors.univ_pos_tagger('EN')

        # Patterns that contain backslashes are written as raw strings:
        # '\,', '\;', '\:' and '\-' are not valid escape sequences in a normal
        # string literal and trigger a warning on modern Python. The runtime
        # value of every pattern is unchanged.
        self.rules = {
            # RulesPattern arguments: pattern ---> left ---> right
            # Two verb groups joined by a <CC> tag.
            'conjunction':
            RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*',
                         '<CC><.*>*<V.*><.*>*'),
            # Subordinate clause introduced by a wh-word tag or IN/that.
            'last':
            RulesPattern('<.*>*<V.*><.*>*<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*',
                         '<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            # Same as 'last', but the wh-word must directly follow a <CC> or
            # one of the punctuation tags , ; : -
            'compose_sub':
            RulesPattern(
                r'<.*>*<V.*><.*>*<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                r'<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            # Two verb groups separated by one of the punctuation tags , ; : -
            'comma':
            RulesPattern(
                r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                r'<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }
예제 #3
0
  def _only_word_sequence(text, lang):
    """Split the original sequence into content words and a stop-word mask.

    Tokens are maximal runs of ASCII letters and digits, CJK ideographs
    (U+4E00-U+9FFF), hiragana (U+3040-U+309F), the 'ー' mark and katakana
    (U+30A0-U+30FF); everything else acts as a separator.

    :param text: original sequence to tokenize
    :param lang: language code handed to ``TMUtilsMatching.check_stopwords``
    :return: tuple ``(only_word, only_st)``. ``only_word`` holds the tokens
        that are not stop words. ``only_st`` holds one entry per token: the
        token itself when it is a stop word, otherwise the placeholder 'P'.
    """
    stopwords = TMUtilsMatching.check_stopwords(lang)
    token_pattern = r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+'

    content_words = []
    stopword_mask = []
    for token in (found.group() for found in re.finditer(token_pattern, text)):
      # NOTE(review): when no stop-word list exists for the language every
      # token is skipped, so both result lists stay empty -- confirm that
      # callers expect this rather than "all tokens are content words".
      if not stopwords:
        continue
      if token in stopwords:
        stopword_mask.append(token)
        continue
      stopword_mask.append('P')
      content_words.append(token)

    return content_words, stopword_mask
예제 #4
0
파일: TMSplit.py 프로젝트: MittagQI/nectm
    def __init__(self):
        """Prepare stop words, POS tagger and split rules for 'FR'.

        Sets four attributes: ``order`` (list of rule names), ``sw`` (result
        of ``TMUtilsMatching.check_stopwords('FR')``), ``ut`` (result of
        ``TMTextProcessors.univ_pos_tagger('FR')``) and ``rules`` (rule name
        -> ``RulesPattern``).
        """
        # Rule names; presumably the order in which the rules are tried.
        # Here 'last' is the conjunction rule.
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords('FR')
        self.ut = TMTextProcessors.univ_pos_tagger('FR')

        # Every RulesPattern gets three tag patterns. They look like
        # (whole sequence, left part, right part) -- confirm against the
        # RulesPattern definition.

        # Two verb groups joined by a <KON> tag.
        conjunction_rule = RulesPattern(
            '<.*>*<V.*><.*>*<KON><.*>*<V.*><.*>*',
            '<.*>*<V.*><.*>*',
            '<KON><.*>*<V.*><.*>*')

        # Two verb groups separated by a punctuation mark (<PUN> or <SENT>).
        punctuation_rule = RulesPattern(
            '<.*>*<V.*><.*>*<PUN|SENT><.*>*<V.*><.*>*',
            '<.*>*<V.*><.*>*',
            '<PUN|SENT><.*>*<V.*><.*>*')

        self.rules = {
            'last': conjunction_rule,
            'comma': punctuation_rule,
        }
예제 #5
0
파일: TMSplit.py 프로젝트: MittagQI/nectm
    def __init__(self):
        """Prepare stop words, POS tagger and the two split rules for 'EN'.

        Sets four attributes: ``order`` (list of rule names), ``sw`` (result
        of ``TMUtilsMatching.check_stopwords('EN')``), ``ut`` (result of
        ``TMTextProcessors.univ_pos_tagger('EN')``) and ``rules`` (rule name
        -> ``RulesPattern``).
        """
        # Rule names; presumably the order in which the rules are tried.
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords('EN')
        self.ut = TMTextProcessors.univ_pos_tagger('EN')

        # Patterns that contain backslashes are written as raw strings:
        # '\,', '\;', '\:' and '\-' are not valid escape sequences in a normal
        # string literal and trigger a warning on modern Python. The runtime
        # value of every pattern is unchanged.
        #
        # NOTE(review): the 'comma' key carries the <CC> pattern while 'last'
        # carries the punctuation pattern -- the names look swapped relative
        # to the patterns; confirm this is intended before relying on them.
        self.rules = {
            # RulesPattern arguments: pattern ---> left ---> right
            'comma':
            RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', '<CC><.*>*<V.*><.*>*'),
            'last':
            RulesPattern(r'<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', r'<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }
예제 #6
0
파일: TMSplit.py 프로젝트: MittagQI/nectm
    def __init__(self, lang):
        """Prepare stop words, POS tagger and split rules for ``lang``.

        Sets four attributes: ``order`` (list of rule names), ``sw`` (result
        of ``TMUtilsMatching.check_stopwords(lang)``), ``ut`` (result of
        ``TMTextProcessors.univ_pos_tagger(lang)``) and ``rules`` (rule name
        -> ``RulesPattern``).

        :param lang: language code forwarded unchanged to the stop-word and
            POS-tagger helpers
        """
        # Rule names; presumably the order in which the rules are tried.
        self.order = ['comma', 'last']

        self.sw = TMUtilsMatching.check_stopwords(lang)
        self.ut = TMTextProcessors.univ_pos_tagger(lang)

        # Patterns that contain a backslash are written as raw strings: '\.'
        # is not a valid escape sequence in a normal string literal and
        # triggers a warning on modern Python. The runtime value of every
        # pattern is unchanged.
        self.rules = {
            # Two <VERB> groups joined by a conjunction tag.
            # NOTE(review): the whole-sequence pattern accepts <CONJ|SCONJ>
            # but the right-part pattern only accepts <CONJ> -- confirm
            # whether a split on <SCONJ> is meant to be supported.
            'last':
            RulesPattern(
                '<.*>*<VERB><.*>*<CONJ|SCONJ><.*>*<VERB><.*>*',
                '<.*>*<VERB><.*>*',
                '<CONJ><.*>*<VERB><.*>*'),
            # Two <VERB> groups separated by a token tagged '.'.
            'comma':
            RulesPattern(r'<.*>*<VERB><.*>*<\.><.*>*<VERB><.*>*',
                         '<.*>*<VERB><.*>*', r'<\.><.*>*<VERB><.*>*'),
        }