import re
from string import punctuation

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


def __wordTokenize__(self):
    # Flatten the per-sentence token lists into a single token list.
    v_tokens = []
    for token_by_sent in [word_tokenize(sentence) for sentence in self.__sentTokenize__()]:
        v_tokens += token_by_sent

    # Drop stop words, ASCII punctuation, and common unicode punctuation
    # (ellipsis, backticks, em dash, en dash). g_StopWordsEnglish is a
    # module-level English stop-word list defined elsewhere in the repo.
    v_tokens = [item for item in v_tokens if item not in g_StopWordsEnglish]
    v_tokens = [item for item in v_tokens if item not in punctuation]
    v_tokens = [item for item in v_tokens
                if item not in (u"...", u'``', u'\u2014', u'\u2026', u'\u2013')]

    # Instantiate the lemmatizer and stemmer once instead of per token.
    v_lemmatizer = WordNetLemmatizer()
    v_stemmer = PorterStemmer()

    self.__clean_tokens = []
    for token in v_tokens:
        # Lemmatize first as a noun (the default), then as a verb.
        token = v_lemmatizer.lemmatize(token)
        token = v_lemmatizer.lemmatize(token, pos='v')
        if self.__filterMinSize is not None and len(token) < self.__filterMinSize:
            continue
        if self.__usePorterStemmer:
            token = v_stemmer.stem(token)
        self.__clean_tokens.append(token.strip())

    # Filter tokens by POS tag, applying the include filter before the
    # exclude filter (previously the exclude filter silently discarded
    # the include filter's result).
    v_pos_tag = nltk.pos_tag(self.__clean_tokens)
    if self.__includeTags is not None:
        v_pos_tag = [item for item in v_pos_tag if item[1] in self.__includeTags]
    if self.__excludeTags is not None:
        v_pos_tag = [item for item in v_pos_tag if item[1] not in self.__excludeTags]
    self.__clean_tokens = [item[0] for item in v_pos_tag]
    self.__clean_document = ' '.join(self.__clean_tokens)

    # Group the cleaned tokens by POS tag, deduplicated and sorted.
    self.__pos_tags = {}
    for v_word, v_tag in nltk.pos_tag(self.__clean_tokens):
        self.__pos_tags.setdefault(v_tag, []).append(v_word)
    self.__pos_tags = {key: sorted(set(value)) for key, value in self.__pos_tags.items()}

    return self.__clean_document
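
# A standalone sketch of the same cleaning steps, useful for checking that
# the required NLTK data ('punkt', 'stopwords', 'wordnet',
# 'averaged_perceptron_tagger') is installed. The text and variable names
# below are illustrative, not part of the class above.
if __name__ == '__main__':
    v_demo_text = "The cats were running quickly through the gardens."
    v_demo_stopwords = set(stopwords.words('english'))
    v_demo_lemmatizer = WordNetLemmatizer()
    v_demo_tokens = [t for t in word_tokenize(v_demo_text)
                     if t not in v_demo_stopwords and t not in punctuation]
    v_demo_tokens = [v_demo_lemmatizer.lemmatize(v_demo_lemmatizer.lemmatize(t), pos='v')
                     for t in v_demo_tokens]
    # Typically ['The', 'cat', 'run', 'quickly', 'garden']
    print(nltk.pos_tag(v_demo_tokens))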

def _fix_predicate_morphology(subject, predicate, complement, format='triple'):
    """
    Conjugate a predicate between its triple form and its natural form.

    Parameters
    ----------
    subject
    predicate
    complement
    format: 'triple' or 'natural'

    Returns
    -------
    str
        The predicate with 'is'/'be' normalized: hyphen-joined for
        'triple', space-joined for 'natural'.
    """
    # TODO revise by Lenka
    new_predicate = ''
    if format == 'triple':
        if len(predicate.split()) > 1:
            # Hyphen-join multi-word predicates, normalizing 'is' to 'be';
            # joining avoids the trailing hyphen the old loop left behind.
            new_predicate = '-'.join('be' if el == 'is' else el for el in predicate.split())
        elif predicate.endswith('s'):
            new_predicate = WordNetLemmatizer().lemmatize(predicate)
        else:
            new_predicate = predicate
    elif format == 'natural':
        if len(predicate.split()) > 1:
            # The inverse mapping: 'be' back to 'is', joined with spaces.
            new_predicate = ' '.join('is' if el == 'be' else el for el in predicate.split())
        # elif predicate == wnl.lemmatize(predicate):
        #     new_predicate = predicate + 's'  # TODO conjugate!
        else:
            new_predicate = predicate
    return new_predicate.strip(' ')
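
# A quick usage sketch for _fix_predicate_morphology. The sample arguments
# are illustrative; 'subject' and 'complement' are accepted but unused.
if __name__ == '__main__':
    # Multi-word predicate, triple form: 'is' becomes 'be', hyphen-joined.
    print(_fix_predicate_morphology('lenka', 'is born in', 'prague'))
    # Expected: 'be-born-in'

    # Multi-word predicate, natural form: 'be' becomes 'is', space-joined.
    print(_fix_predicate_morphology('lenka', 'be born in', 'prague', format='natural'))
    # Expected: 'is born in'

    # A single predicate ending in 's' is lemmatized in triple form.
    print(_fix_predicate_morphology('lenka', 'likes', 'cats'))
    # Typically 'like', depending on the installed WordNet data.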

def _fix_predicate_morphology(subject, predicate, object, format='triple'):
    """
    Conjugate a predicate between its triple form and its natural form.

    Parameters
    ----------
    subject
    predicate
    object
    format: 'triple' or 'natural'

    Returns
    -------
    str
    """
    # TODO: Copied from language.utils.helper_functions, because of circular dependency issues...
    # TODO revise by Lenka
    new_predicate = ''
    if format == 'triple':
        if len(predicate.split()) > 1:
            # Hyphen-join multi-word predicates, normalizing 'is' to 'be';
            # joining avoids the trailing hyphen the old loop left behind.
            new_predicate = '-'.join('be' if el == 'is' else el for el in predicate.split())
        elif predicate.endswith('s'):
            new_predicate = WordNetLemmatizer().lemmatize(predicate)
        else:
            new_predicate = predicate
    elif format == 'natural':
        if len(predicate.split()) > 1:
            # The inverse mapping: 'be' back to 'is', joined with spaces.
            new_predicate = ' '.join('is' if el == 'be' else el for el in predicate.split())
        # elif predicate == wnl.lemmatize(predicate):
        #     new_predicate = predicate + 's'
        else:
            new_predicate = predicate
    return new_predicate.strip(' ')

def messageTokenize(p_text):
    """
    Transform a message into a tokenized, lemmatized, and stemmed string.
    Also returns flags indicating whether the message starts or ends with
    a verb or a proper noun, and whether it contains a proper noun at all.
    """
    v_text = p_text

    # Replace every URL with a fixed placeholder before tokenizing.
    v_url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for url in re.findall(v_url_regex, v_text):
        v_text = v_text.replace(url, "urlplaceholder")

    # Flags: does any sentence start or end with a verb or a proper noun,
    # and does the message contain a proper noun anywhere?
    v_first_verb = 0
    v_last_verb = 0
    v_first_nnp = 0
    v_last_nnp = 0
    v_nnp = 0
    v_verb_tags = ['VB', 'VBP', 'VBZ', 'VBG']
    for sentence in nltk.sent_tokenize(v_text):
        pos_tags = nltk.pos_tag(word_tokenize(sentence))
        if v_first_verb == 0 and pos_tags[0][1] in v_verb_tags:
            v_first_verb = 1
        if v_last_verb == 0 and pos_tags[-1][1] in v_verb_tags:
            v_last_verb = 1
        if v_first_nnp == 0 and pos_tags[0][1] == 'NNP':
            v_first_nnp = 1
        if v_last_nnp == 0 and pos_tags[-1][1] == 'NNP':
            v_last_nnp = 1
        if v_nnp == 0 and any(v_tag == 'NNP' for v_word, v_tag in pos_tags):
            v_nnp = 1

    # Normalize: lowercase, strip non-alphanumerics, drop stop words.
    v_text = re.sub(r'[^a-zA-Z0-9]', ' ', v_text.lower())
    v_stopwords = set(stopwords.words('english'))
    v_tokens = [item.strip() for item in word_tokenize(v_text) if item not in v_stopwords]

    # Lemmatize (as noun, then as verb) and stem each remaining token.
    v_lemmatizer = WordNetLemmatizer()
    v_stemmer = PorterStemmer()
    v_clean_tokens = []
    for token in v_tokens:
        token = v_lemmatizer.lemmatize(token)
        token = v_lemmatizer.lemmatize(token, pos='v')
        token = v_stemmer.stem(token)
        v_clean_tokens.append(token.strip())

    v_text = ' '.join(v_clean_tokens)
    return (v_text, v_first_verb, v_last_verb, v_first_nnp, v_last_nnp, v_nnp)
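
# A minimal usage sketch for messageTokenize, assuming the NLTK data
# packages 'punkt', 'stopwords', 'wordnet', and
# 'averaged_perceptron_tagger' are available. The message below is
# illustrative only.
if __name__ == '__main__':
    v_message = "Send water to http://example.org/shelter now. John is waiting."
    v_clean, v_first_verb, v_last_verb, v_first_nnp, v_last_nnp, v_nnp = \
        messageTokenize(v_message)
    # v_clean is the lowercased, lemmatized, stemmed text with the URL
    # replaced by 'urlplaceholder'; the five flags are 0/1 indicators for
    # sentence-initial/final verbs and proper nouns, and for the presence
    # of any proper noun in the message.
    print(v_clean, v_first_verb, v_last_verb, v_first_nnp, v_last_nnp, v_nnp)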