Code example #1
    # Requires: import nltk
    #           from nltk import word_tokenize
    #           from nltk.stem import PorterStemmer, WordNetLemmatizer
    # Assumes g_StopWordsEnglish is an English stop-word collection and
    # punctuation comes from string.punctuation.
    def __wordTokenize__(self):
        # Flatten the per-sentence token lists into a single token list.
        v_tokens = []
        for sentence in self.__sentTokenize__():
            v_tokens += word_tokenize(sentence)

        # Drop stop words, punctuation, and a few Unicode leftovers
        # (ellipsis, backticks, em/en dashes).
        v_noise = [u"...", u'``', u'\u2014', u'\u2026', u'\u2013']
        v_tokens = [
            token for token in v_tokens
            if token not in g_StopWordsEnglish
            and token not in punctuation
            and token not in v_noise
        ]

        # Create the lemmatizer and stemmer once instead of per token.
        v_lemmatizer = WordNetLemmatizer()
        v_stemmer = PorterStemmer()

        self.__clean_tokens = []
        for token in v_tokens:
            # Lemmatize as a noun (the default), then as a verb.
            token = v_lemmatizer.lemmatize(token)
            token = v_lemmatizer.lemmatize(token, pos='v')

            if self.__filterMinSize is not None:
                if len(token) < self.__filterMinSize:
                    continue

            if self.__usePorterStemmer:
                token = v_stemmer.stem(token)

            self.__clean_tokens.append(token.strip())

        # Apply the optional include filter, then the optional exclude
        # filter, on the tokens' part-of-speech tags.
        v_pos_tag = nltk.pos_tag(self.__clean_tokens)

        if self.__includeTags is not None:
            v_pos_tag = [item for item in v_pos_tag
                         if item[1] in self.__includeTags]

        if self.__excludeTags is not None:
            v_pos_tag = [item for item in v_pos_tag
                         if item[1] not in self.__excludeTags]

        self.__clean_tokens = [item[0] for item in v_pos_tag]

        self.__clean_document = ' '.join(self.__clean_tokens)

        # Group the cleaned tokens by POS tag, de-duplicated and sorted.
        self.__pos_tags = {}
        for word, tag in nltk.pos_tag(self.__clean_tokens):
            self.__pos_tags.setdefault(tag, []).append(word)

        self.__pos_tags = {
            key: sorted(set(value))
            for key, value in self.__pos_tags.items()
        }

        return self.__clean_document
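A minimal setup sketch for the example above, assuming the stock NLTK resource names (the class wrapping __wordTokenize__ is not shown here):

import nltk
nltk.download('punkt')                        # sentence/word tokenizers
nltk.download('stopwords')                    # English stop-word list
nltk.download('wordnet')                      # WordNetLemmatizer data
nltk.download('averaged_perceptron_tagger')   # model behind nltk.pos_tag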
Code example #2
    # Requires: from nltk.stem import WordNetLemmatizer
    def _fix_predicate_morphology(subject,
                                  predicate,
                                  complement,
                                  format='triple'):
        """
        Fix the conjugation of a predicate for the requested output format.

        Parameters
        ----------
        subject
            Subject of the triple (currently unused).
        predicate
            Predicate whose morphology is adjusted.
        complement
            Complement of the triple (currently unused).
        format
            'triple' yields a hyphenated, lemmatized predicate;
            'natural' yields a space-separated, conjugated one.

        Returns
        -------
        str
            The rewritten predicate.
        """
        # TODO revise by Lenka
        new_predicate = ''
        if format == 'triple':
            # Multi-word predicate: rewrite 'is' as 'be' and join with '-'.
            if len(predicate.split()) > 1:
                for el in predicate.split():
                    if el == 'is':
                        new_predicate += 'be-'
                    else:
                        new_predicate += el + '-'

            # Single-word predicate ending in 's': reduce it to its lemma.
            elif predicate.endswith('s'):
                new_predicate = WordNetLemmatizer().lemmatize(predicate)

            else:
                new_predicate = predicate

        elif format == 'natural':
            # Multi-word predicate: rewrite 'be' as 'is' and join with spaces.
            if len(predicate.split()) > 1:
                for el in predicate.split():
                    if el == 'be':
                        new_predicate += 'is '
                    else:
                        new_predicate += el + ' '

            # elif predicate == wnl.lemmatize(predicate):
            #    new_predicate = predicate + 's' # TODO conjugate!

            else:
                new_predicate = predicate

        # Strip the trailing join character ('-' or ' ') before returning.
        return new_predicate.strip(' -')
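A hypothetical call sketch, with invented inputs, showing what each format produces (assuming the function is reachable as a plain function or staticmethod):

_fix_predicate_morphology('lenka', 'is friends with', 'selene')
# -> 'be-friends-with'
_fix_predicate_morphology('lenka', 'be friends with', 'selene', format='natural')
# -> 'is friends with'
_fix_predicate_morphology('lenka', 'likes', 'selene')
# -> 'like' (single word ending in 's' is lemmatized)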
Code example #3
    # Requires: from nltk.stem import WordNetLemmatizer
    def _fix_predicate_morphology(subject, predicate, object, format='triple'):
        """
        Fix the conjugation of a predicate for the requested output format.

        Parameters
        ----------
        subject
            Subject of the triple (currently unused).
        predicate
            Predicate whose morphology is adjusted.
        object
            Object of the triple (currently unused).
        format
            'triple' yields a hyphenated, lemmatized predicate;
            'natural' yields a space-separated, conjugated one.

        Returns
        -------
        str
            The rewritten predicate.
        """
        # TODO: Copied from language.utils.helper_functions, because of circular dependency issues...
        # TODO revise by Lenka
        new_predicate = ''
        if format == 'triple':
            # Multi-word predicate: rewrite 'is' as 'be' and join with '-'.
            if len(predicate.split()) > 1:
                for el in predicate.split():
                    if el == 'is':
                        new_predicate += 'be-'
                    else:
                        new_predicate += el + '-'

            # Single-word predicate ending in 's': reduce it to its lemma.
            elif predicate.endswith('s'):
                new_predicate = WordNetLemmatizer().lemmatize(predicate)

            else:
                new_predicate = predicate

        elif format == 'natural':
            # Multi-word predicate: rewrite 'be' as 'is' and join with spaces.
            if len(predicate.split()) > 1:
                for el in predicate.split():
                    if el == 'be':
                        new_predicate += 'is '
                    else:
                        new_predicate += el + ' '

            # elif predicate == wnl.lemmatize(predicate):
            #    new_predicate = predicate + 's'

            else:
                new_predicate = predicate

        # Strip the trailing join character ('-' or ' ') before returning.
        return new_predicate.strip(' -')
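The TODO above says this helper was copied from language.utils.helper_functions to dodge an import cycle. A minimal sketch of one conventional alternative, assuming that module exports the same function: defer the import to call time so the cycle never triggers at module load.

    def _fix_predicate_morphology(subject, predicate, object, format='triple'):
        # Local import: resolved only when the function is called, which
        # sidesteps the module-level circular dependency.
        from language.utils.helper_functions import \
            _fix_predicate_morphology as impl
        return impl(subject, predicate, object, format=format)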
Code example #4
import re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


def messageTokenize(p_text):
    """
    Transform a message into a tokenized, normalized string. Also returns
    flags for whether the message starts or ends with a verb, starts or
    ends with a proper noun, and contains a proper noun at all.
    """
    v_text = p_text

    # Replace every URL with a placeholder token so the tokenizer and POS
    # tagger treat it as an ordinary word.
    v_url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    v_urls = re.findall(v_url_regex, v_text)

    for url in v_urls:
        v_text = v_text.replace(url, "urlplaceholder")

    # Flag verbs / proper nouns at sentence boundaries and anywhere inside.
    sentence_list = nltk.sent_tokenize(v_text)
    v_first_verb = 0
    v_last_verb = 0
    v_first_nnp = 0
    v_last_nnp = 0
    v_nnp = 0
    v_verb_tags = ['VB', 'VBP', 'VBZ', 'VBG']
    for sentence in sentence_list:
        pos_tags = nltk.pos_tag(word_tokenize(sentence))
        if not pos_tags:
            continue

        v_first_word, v_first_tag = pos_tags[0]
        v_last_word, v_last_tag = pos_tags[-1]

        if v_first_tag in v_verb_tags:
            v_first_verb = 1
        if v_last_tag in v_verb_tags:
            v_last_verb = 1
        if v_first_tag == 'NNP':
            v_first_nnp = 1
        if v_last_tag == 'NNP':
            v_last_nnp = 1
        if any(tag == 'NNP' for _, tag in pos_tags):
            v_nnp = 1

    # Keep only alphanumerics, lower-case everything, then tokenize and
    # drop English stop words.
    v_text = re.sub(r'[^a-zA-Z0-9]', ' ', v_text.lower())
    v_stop_words = set(stopwords.words('english'))
    v_tokens = [
        item.strip() for item in word_tokenize(v_text)
        if item not in v_stop_words
    ]

    # Lemmatize each token as a noun, then as a verb, then stem it.
    v_lemmatizer = WordNetLemmatizer()
    v_stemmer = PorterStemmer()
    v_clean_tokens = []
    for token in v_tokens:
        token = v_lemmatizer.lemmatize(token)
        token = v_lemmatizer.lemmatize(token, pos='v')
        token = v_stemmer.stem(token)
        v_clean_tokens.append(token.strip())

    v_text = ' '.join(v_clean_tokens)

    return (v_text, v_first_verb, v_last_verb, v_first_nnp, v_last_nnp, v_nnp)
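A hypothetical call, with an invented input message (exact tokens depend on the NLTK tagger and stemmer versions in use):

text, first_verb, last_verb, first_nnp, last_nnp, has_nnp = messageTokenize(
    "Send water to Boston, see https://example.org/help")
# text       -> e.g. 'send water boston see urlplacehold' (stemmed tokens)
# first_verb -> 1 if the tagger labels 'Send' as a verb
# has_nnp    -> 1 because 'Boston' should be tagged NNP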