from nltk import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import RegexpTokenizer


def text2sents(text, lemmatize=False, stemmer=None):
    """
    Converts a text into a list of sentences consisting of normalized words.
    :param text: string to process
    :param lemmatize: if True, words will be lemmatized, otherwise stemmed
    :param stemmer: stemmer to be used; if None, PorterStemmer is used.
        Only applied if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []
    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1]))
                               for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]
        sents_normalized.append(sent_normalized)
    return sents_normalized
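# NOTE: get_wordnet_pos() used above is not defined in this snippet. A minimal
# sketch, assuming the usual mapping from Penn Treebank tags to WordNet POS
# constants (the original helper may differ):
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to a WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    # default to noun, which matches WordNetLemmatizer's default behaviour
    return wordnet.NOUN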
def get_tagger():
    """
    Returns:
        PerceptronTagger: a part-of-speech tagger backed by NLTK's
        averaged perceptron model.
    """
    return PerceptronTagger()
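# Usage sketch for get_tagger() (illustrative): the returned tagger implements
# NLTK's TaggerI interface, so it can tag a pre-tokenized sentence directly,
# e.g. get_tagger().tag(['a', 'simple', 'example']) returns a list of
# (token, Penn Treebank tag) tuples.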
def pos_tag_reviews(records):
    print('%s: tag reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    tagger = PerceptronTagger()

    for record in records:
        tagged_words = \
            nlp_utils.tag_words(record[Constants.TEXT_FIELD], tagger)
        record[Constants.POS_TAGS_FIELD] = tagged_words
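# nlp_utils.tag_words() is a project-specific helper that is not shown here.
# A plausible sketch of its behaviour, assuming it word-tokenizes the review
# text and tags it with the tagger passed in (the real helper may differ):
from nltk.tokenize import word_tokenize


def tag_words(text, tagger):
    """Tokenize the text and POS-tag the resulting tokens."""
    return tagger.tag(word_tokenize(text))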
def clean_text(text, stopwords, remove_stopwords=True, pos_filtering=False,
               stemming=True, lower_case=True):
    if lower_case:
        # convert to lower case
        text = text.lower()
    # strip extra white space
    text = re.sub(' +', ' ', text)
    # strip leading and trailing white space
    text = text.strip()
    # tokenize (split based on whitespace)
    tokens = text.split(' ')
    # remove tokens that are pure punctuation
    tokens = [t for t in tokens if t not in string.punctuation]
    if pos_filtering:
        tagger = PerceptronTagger()
        # apply POS-tagging
        tagged_tokens = tagger.tag(tokens)
        # retain only nouns, adjectives and verbs
        tokens = [
            item[0] for item in tagged_tokens
            if item[1] in [
                'NN', 'NNS', 'NNP', 'NNPS',
                'JJ', 'JJS', 'JJR',
                'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
            ]
        ]
    if remove_stopwords:
        # remove stopwords
        tokens = [token for token in tokens if token.lower() not in stopwords]
    if stemming:
        stemmer = nltk.stem.PorterStemmer()
        # apply Porter's stemmer
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed
    return tokens
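# Usage sketch for clean_text() (the stopword set here is illustrative):
#
#     example_stopwords = {'the', 'a', 'of', 'on'}
#     tokens = clean_text('The cats are sitting on the mat', example_stopwords)
#
# With the defaults this lower-cases the text, splits it on whitespace, drops
# the stopwords, and Porter-stems the remaining tokens (e.g. 'cats' -> 'cat',
# 'sitting' -> 'sit').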
class Syntax(Characteristic):
    """
    Accounts for the syntactic aspects of the source text.

    Word-for-word: Parses and stores part-of-speech (POS) tags.
    Entire text: Enumerates all configurations of clause and sentence
    found in the text.
    """

    POS_TAG = PerceptronTagger()

    def __init__(self, tokens):
        Characteristic.__init__(self)

    def pos_tag(self):
        pass

    def find_clauses(self):
        pass

    def find_sentences(self):
        pass
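# The Syntax class above is a stub; pos_tag() is not implemented. A minimal,
# self-contained sketch of the same pattern, a tagger loaded once at class
# level and shared by all instances (names here are illustrative, not the
# original implementation):
from nltk.tag.perceptron import PerceptronTagger


class TaggedText:
    # load the averaged perceptron model once and reuse it across instances
    SHARED_TAGGER = PerceptronTagger()

    def __init__(self, tokens):
        self.tokens = tokens

    def pos_tag(self):
        # tag the stored token list with the shared tagger
        return TaggedText.SHARED_TAGGER.tag(self.tokens)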
if domain == 'meeting':
    path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
    path_to_filler_words = path_to_root + 'resources/stopwords/meeting/filler_words.' + language + '.txt'
    stopwords = utils.load_stopwords(path_to_stopwords)
    filler_words = utils.load_filler_words(path_to_filler_words)

    if dataset_id == 'ami':
        ids = meeting_lists.ami_development_set + meeting_lists.ami_test_set
    elif dataset_id == 'icsi':
        ids = meeting_lists.icsi_development_set + meeting_lists.icsi_test_set

if language == 'en':
    path_to_word2vec_keys = path_to_root + 'resources/word2vec_keys.txt'
    # tokenizer = DictionaryTokenizer(path_to_word2vec_keys)  # highly time-consuming
    # tokenizer = TweetTokenizer()
    tagger = PerceptronTagger()

# ######################
# ### CORPUS LOADING ###
# ######################

corpus = {}
for id in ids:
    if domain == 'meeting':
        if dataset_id == 'ami' or dataset_id == 'icsi':
            if source == 'asr':
                path = path_to_root + 'data/meeting/' + dataset_id + '/' + id + '.da-asr'
            elif source == 'manual':
                path = path_to_root + 'data/meeting/' + dataset_id + '/' + id + '.da'
            # filler words will be removed during corpus loading
            corpus[id] = utils.read_ami_icsi(path, filler_words)
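# utils.load_stopwords() and utils.load_filler_words() are project helpers not
# shown here. A plausible sketch, assuming one entry per line in each resource
# file (the real helpers, like utils.read_ami_icsi(), may differ):
def load_stopwords(path):
    """Read a stopword file with one word per line into a set."""
    with open(path, encoding='utf-8') as f:
        return {line.strip() for line in f if line.strip()}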