Code example #1
import re

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


def preproc(xtrain, xtest):
    xtrain_1 = []
    xtest_1 = []

    stemmer = PorterStemmer()
    for X in xtrain:
        Y = str(X).replace('\n', '')

        X = WhitespaceTokenizer().tokenize(str(Y))
        X = re.sub(r'[^\w]', ' ', str(X))
        X = word_tokenize(str(X))
        stems = [stemmer.stem(token) for token in X]
        xtrain_1.append(str(stems))

    for X in xtest:
        Y = str(X).replace('\n', '')

        X = WhitespaceTokenizer().tokenize(str(Y))
        X = re.sub(r'[^\w]', ' ', str(X))
        X = word_tokenize(str(X))
        stems = [stemmer.stem(token) for token in X]
        xtest_1.append(str(stems))

    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.2)
    Xtrain = vectorizer.fit_transform(xtrain_1)
    Xtest = vectorizer.transform(xtest_1)
    return (Xtrain, Xtest)
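For orientation, here is a compact, self-contained sketch of the same stem-then-TF-IDF idea on made-up sentences (it assumes nltk with the punkt data and scikit-learn are installed; the max_df=0.2 filter above is omitted because it only makes sense on a reasonably large corpus):

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = PorterStemmer()

def stem_doc(doc):
    # Collapse newlines, tokenize, stem each token, and re-join into a string.
    tokens = word_tokenize(str(doc).replace('\n', ' '))
    return ' '.join(stemmer.stem(tok) for tok in tokens)

train_docs = ["Cats are running fast.", "Dogs ran faster than the cats."]
test_docs = ["A cat runs."]

vectorizer = TfidfVectorizer(stop_words="english")
Xtrain = vectorizer.fit_transform([stem_doc(d) for d in train_docs])
Xtest = vectorizer.transform([stem_doc(d) for d in test_docs])
print(sorted(vectorizer.vocabulary_))
print(Xtrain.shape, Xtest.shape)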
Code example #2
    def __init__(self, vocab_size=30522, pct_bpe=1, word_tokenizer=None,
                 silent=True, ngram_min=2, ngram_max=2, required_tokens=None,
                 strict=False, lowercase=True,
                 EOW=DEFAULT_EOW, SOW=DEFAULT_SOW, UNK=DEFAULT_UNK, PAD=DEFAULT_PAD,
                 MASK=DEFAULT_MASK, CLS=DEFAULT_CLS, SEP=DEFAULT_SEP):
        if vocab_size < 1:
            raise ValueError('vocab size must be greater than 0.')

        self.EOW = EOW
        self.SOW = SOW
        self.eow_len = len(EOW)
        self.sow_len = len(SOW)
        self.UNK = UNK
        self.PAD = PAD
        self.cls_token = CLS
        self.sep_token = SEP
        self.mask_token = MASK
        self.required_tokens = list(set(required_tokens or []).union(
            {self.UNK, self.PAD, self.mask_token, self.cls_token, self.sep_token}))
        self.vocab_size = vocab_size
        self.pct_bpe = pct_bpe
        self.word_vocab_size = max([int(vocab_size * (1 - pct_bpe)), len(self.required_tokens or [])])
        self.bpe_vocab_size = vocab_size - self.word_vocab_size
        # Default to whitespace tokenization; alternatives tried here include
        # ucto_tokenize and wordpunct_tokenize.
        self.word_tokenizer = word_tokenizer if word_tokenizer is not None else WhitespaceTokenizer().tokenize
        self.word_tokenizer_fitting = WhitespaceTokenizer().tokenize
        self.custom_tokenizer = word_tokenizer is not None
        self.word_vocab = {}  # type: Dict[str, int]
        self.bpe_vocab = {}  # type: Dict[str, int]
        self.inverse_word_vocab = {}  # type: Dict[int, str]
        self.inverse_bpe_vocab = {}  # type: Dict[int, str]
        self._progress_bar = iter if silent else tqdm
        self.ngram_min = ngram_min
        self.ngram_max = ngram_max
        self.strict = strict
        self.lowercase = lowercase
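A quick illustration of the word/BPE vocabulary split computed above, with n_required=5 standing in for the five special tokens (UNK, PAD, MASK, CLS, SEP); the settings shown are hypothetical:

def vocab_split(vocab_size, pct_bpe, n_required=5):
    # Mirrors the arithmetic in the constructor above.
    word_vocab_size = max(int(vocab_size * (1 - pct_bpe)), n_required)
    bpe_vocab_size = vocab_size - word_vocab_size
    return word_vocab_size, bpe_vocab_size

print(vocab_split(30522, pct_bpe=1))    # (5, 30517): nearly everything goes to BPE
print(vocab_split(30522, pct_bpe=0.5))  # (15261, 15261): an even split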
Code example #3
File: models.py Project: luzhaoxin/mark2cure
    def resultwords(self, user_view, gm_view):
        # Gather words and positions from the text
        words_index = WhitespaceTokenizer().span_tokenize(self.text)
        words_text = WhitespaceTokenizer().tokenize(self.text)
        words = zip(words_index, words_text)

        # Add counters for consensus count and personal annotation
        # ((start, stop), 'Word string itself', Intensity, GM Ann ID, User Ann ID, Did user annotate)
        words = [w + (0, None, None, False,) for w in words]

        # Gather other annotations from GM and users for this section
        gm_anns = Annotation.objects.filter(view=gm_view).values_list('pk', 'start', 'text')

        # Build the running counter of times a word was annotated
        for gm_pk, start, text in gm_anns:
            length = len(text)

            for idx, word in enumerate(words):
                word_start = word[0][0]
                counter = word[2]
                if word_start >= start and word_start <= start + length:
                    counter += 1
                    words[idx] = (word[0], word[1], counter, gm_pk, word[3], word[4])

        user_anns = Annotation.objects.filter(view=user_view).values_list('pk', 'start', 'text')

        # Build the running counter of times a word was annotated
        for user_pk, start, text in user_anns:
            length = len(text)
            for idx, word in enumerate(words):
                word_start = word[0][0]
                if word_start >= start and word_start <= start + length:
                    words[idx] = (word[0], word[1], word[2], word[3], user_pk, True)

        return words
Code example #4
    def My_tokenizer(
        self,
        sentence,
        charoff=False
    ):  # I removed an earlier version of this tokenizer that contained a set of rules to capture named entities. Now it's just an NLTK tokenizer.

        numbers = [str(i) for i in range(1000)]
        tokens = []
        char_o = []
        tokenized = word_tokenize(sentence)

        for i in tokenized:
            if i[-1] in string.punctuation:
                tokens.append(i[:-1])
            else:
                tokens.append(i)

        c_tokens = [
            token for token in tokens if not token in string.punctuation
            if not token in numbers
        ]
        l_tokens = " ".join([t + " " for t in c_tokens])
        b = WhitespaceTokenizer().tokenize(l_tokens)
        if charoff:
            if len(c_tokens) > 1:
                m = list(WhitespaceTokenizer().span_tokenize(l_tokens))
                char_o = list(zip(b, m))
            return char_o
        else:
            return b
Code example #5
File: utils.py Project: AmyOlex/Chrono
def getWhitespaceTokens(file_path):
    with open(file_path, "r") as file:
        raw_text = file.read()
    ## Testing the replacement of all "=" signs by spaces before tokenizing.
    text = raw_text.translate(str.maketrans("=", ' '))

    ## Tokenize the sentences
    sentences = sent_tokenize(text)

    ## Get spans of the sentences
    sent_spans = align_tokens(sentences, text)

    ## create empty arrays for white space tokens and sentence delimiters
    tokenized_text = []
    text_spans = []

    ## Loop through each sentence and get the tokens and token spans
    for s in range(0, len(sentences)):
        # get the tokens and token spans within the sentence
        toks = WhitespaceTokenizer().tokenize(sentences[s])
        span_generator = WhitespaceTokenizer().span_tokenize(sentences[s])
        rel_spans = [span for span in span_generator]

        # convert the relative spans into absolute spans
        abs_spans = []
        for start, end in rel_spans:
            abs_spans = abs_spans + [
                (sent_spans[s][0] + start, sent_spans[s][0] + end)
            ]

        tokenized_text = tokenized_text + toks
        text_spans = text_spans + abs_spans

    ## Now we have the token list and the spans.  We should be able to continue finding sentence boundaries as before
    tags = nltk.pos_tag(tokenized_text)
    sent_boundaries = [0] * len(tokenized_text)

    ## figure out which tokens are at the end of a sentence
    tok_counter = 0

    for s in range(0, len(sentences)):
        sent = sentences[s]

        if "\n" in sent:
            sent_newline = sent.split("\n")
            for sn in sent_newline:
                sent_split = WhitespaceTokenizer().tokenize(sn)
                nw_idx = len(sent_split) + tok_counter - 1
                sent_boundaries[nw_idx] = 1
                tok_counter = tok_counter + len(sent_split)

        else:
            sent_split = WhitespaceTokenizer().tokenize(sent)
            nw_idx = len(sent_split) + tok_counter - 1
            sent_boundaries[nw_idx] = 1
            tok_counter = tok_counter + len(sent_split)

    return raw_text, text, tokenized_text, text_spans, tags, sent_boundaries
Code example #6
import nltk
from nltk.tokenize import WhitespaceTokenizer


def getWhitespaceTokens(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    text = text.replace("\n", "\n\n")
    span_generator = WhitespaceTokenizer().span_tokenize(text)
    spans = [span for span in span_generator]
    tokenized_text = WhitespaceTokenizer().tokenize(text)
    tags = nltk.pos_tag(tokenized_text)
    return text, tokenized_text, spans, tags
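As a small self-contained check (the input string is invented), span_tokenize returns character offsets into the original text, so text[start:end] reproduces each token:

from nltk.tokenize import WhitespaceTokenizer

text = "Admission date: 2019-01-05\nDischarged home."
for start, end in WhitespaceTokenizer().span_tokenize(text):
    print((start, end), repr(text[start:end]))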
Code example #7
File: utils.py Project: NLPatVCU/TACChrono
def getWhitespaceTokens(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    ## Testing the replacement of all "=" signs by spaces before tokenizing.
    text = text.translate(str.maketrans("=", ' '))

    span_generator = WhitespaceTokenizer().span_tokenize(text)
    spans = [span for span in span_generator]
    tokenized_text = WhitespaceTokenizer().tokenize(text)
    tags = nltk.pos_tag(tokenized_text)
    #print(tokenized_text)

    sent_tokenize_list = sent_tokenize(text)
    sent_boundaries = [0] * len(tokenized_text)

    ## figure out which tokens are at the end of a sentence
    tok_counter = 0

    #print("\nLength of tokenized_text: " + str(len(tokenized_text)) + "\n")
    #print("Starting value of tok_counter: " + str(tok_counter))
    #print("Number of tokenized sentences: " + str(len(sent_tokenize_list)))

    for s in range(0, len(sent_tokenize_list)):
        sent = sent_tokenize_list[s]
        #print("Sentence #" + str(s) + "::::" + sent)

        if "\n" in sent:
            #print("Found Newline in Sentence #" + str(s))
            sent_newline = sent.split("\n")
            #print("Sentence #" + str(s) + " has " + str(len(sent_newline)) + " new lines.")
            for sn in sent_newline:
                sent_split = WhitespaceTokenizer().tokenize(sn)
                #print("Newline string :::: " + sn)
                #print("Length of newline string: " + str(len(sent_split)))
                nw_idx = len(sent_split) + tok_counter - 1
                #print("Absolute index of last token in newline string: " + str(len(sent_split)) + "+" + str(tok_counter) + "-1 = " + str(nw_idx))
                sent_boundaries[nw_idx] = 1
                #print("New sent_boundaries: " + str(sent_boundaries))
                tok_counter = tok_counter + len(sent_split)
                #print("Incremented tok_counter by " + str(len(sent_split)) + " to equal " + str(tok_counter))

        else:
            sent_split = WhitespaceTokenizer().tokenize(sent)
            #print("No new lines. tok_counter: " + str(tok_counter))
            #print("Length of sentence: " + str(len(sent_split)))
            #print("Tokenized sentence #" + str(s) + ":::: " + str(sent_split))
            nw_idx = len(sent_split) + tok_counter - 1
            #print("New idx: " + str(nw_idx))
            sent_boundaries[nw_idx] = 1
            #print("New sent_boundaries: " + str(sent_boundaries))
            tok_counter = tok_counter + len(sent_split)
            #print("Incremented tok_counter by " + str(len(sent_split)) + " to equal " + str(tok_counter))

    return text, tokenized_text, spans, tags, sent_boundaries
Code example #8
def initialize_berita(judul, isi):
    return {
        'token_judul': WhitespaceTokenizer().tokenize(stem(preprocess(judul))),
        'isi': isi,
        'list_isi': sent_tokenize(isi),
        'token_isi': [
            WhitespaceTokenizer().tokenize(stem(preprocess(kalimat)))
            for kalimat in sent_tokenize(isi)
        ]
    }
Code example #9
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer("[\w]+")
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    print "regexp_tokenizer:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern="\w+|\$[\d\.]+|\S+")
    # Split on whitespace
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    print "RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions")
    # Select words that start with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer('[A-Z]\w+')
    print "RegexpTokenizer:", capt.tokenize(sent)
    # How a subclass of RegexpTokenizer uses a predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print "BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent)
    # 字符串的切分可以通过空格、间隔、换行等来完成
    from nltk.tokenize import WhitespaceTokenizer
    print "WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent)
    # WordPunctTokenizer splits text with the regular expression \w+|[^\w\s]+,
    # separating alphabetic from non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print "WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent)
    # Splitting with the split() method
    print "split():", sent.split()
    print "split(' '):", sent.split(' ')
    print "split('\n'):", sent.split('\n')
    # Much like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print "LineTokenizer:", LineTokenizer().tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent)
    print "LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent)
    # SpaceTokenizer works much like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print "SpaceTokenizer:", SpaceTokenizer().tokenize(sent)
    # The nltk.tokenize.util module returns the tokenization as a sequence of
    # tuples giving each token's position and offsets within the sentence
    print "Token spans:", list(WhitespaceTokenizer().span_tokenize(sent))
    # Given a sequence of spans, the relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print "位置和偏移:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))
    # 通过在每一个分隔符的连接处进行分割,nltk.tokenize.util.string_span_tokenize(sent,separator)将返回 sent 中标识符的偏移量:
    from nltk.tokenize.util import string_span_tokenize
    print "标识符序列:", list(string_span_tokenize(sent, " "))
Code example #10
def get_file_sentence_length_stats():
    fen = codecs.open('file.en', 'rb', encoding='utf-8')
    fde = codecs.open('file.de', 'rb', encoding='utf-8')
    enlen = []
    for s in fen:
        enlen.append(len(WhitespaceTokenizer().tokenize(s)))
    delen = []
    for s in fde:
        delen.append(len(WhitespaceTokenizer().tokenize(s)))
    print 'English mean: ', np.mean(enlen)
    print 'English std: ', np.std(enlen)
    print 'German mean: ', np.mean(delen)
    print 'German std: ', np.std(delen)

    return True
Code example #11
def get_phrase_length_stats():
    p = pickle.load(open('counts_phrase_lr_dl.p','rb'))
    de_len = []
    en_len = []

    for de_p,en_p in p.keys():
        de_len.append(len(WhitespaceTokenizer().tokenize(de_p)))
        en_len.append(len(WhitespaceTokenizer().tokenize(en_p)))

    print 'English phrase mean: ', np.mean(en_len)
    print 'English phrase std: ', np.std(en_len)
    print 'German phrase mean: ', np.mean(de_len)
    print 'German phrase std: ', np.std(de_len)

    return True
Code example #12
def read_session(lines):
    """
    it takes a path to a transcription file and returns a dictionary that maps conversation id to a list of words.
        :param lines: <class '_io.TextIOWrapper'>

    remember:
    *v: non-Dutch words,  *n: new non-existing words, *s: street  words,
    *a: incomplete words, *u: distorted words, *x: unclear word,
    xxx: unclear utterances, vvv: non-Dutch sentences, ggg: sounds made by the speaker
    """
    lines_to_words = lines.read()
    lines_to_words = re.sub('[0-9]*\.[0-9]*\t', '',
                            lines_to_words)  # to remove timestamps
    lines_to_words = re.sub(
        '[A-Za-z]*\*[anuxANUX]{1}', '',
        lines_to_words)  # to remove words with *n, *a, *u, and *x
    lines_to_words = re.sub('[A-Za-z]*\*[etV]{1}', '',
                            lines_to_words)  # unknown notation
    lines_to_words = re.sub('[A-Za-z]*\*op', '', lines_to_words)  # a mistake?

    lines_to_words = lines_to_words.replace('start\tend\ttext\n', '').replace('.', '').replace('-', ' ')\
        .replace('?', '').replace('\n', ' ').replace('xxx', '').replace('ggg', '').replace('vvv', '')\
        .replace('*v', '').replace('*s', '')

    lines_to_words = re.sub('[A-Za-z]*\*', '',
                            lines_to_words)  # for words with missing notation

    # s = lines_to_words.translate({ord(c): None for c in string.punctuation if c != '*'})
    tk = WhitespaceTokenizer()
    words = tk.tokenize(lines_to_words)

    return words
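As a quick illustration of the timestamp-stripping substitution above (the sample line is invented):

import re

line = "1.23\t4.56\tdit is een zin"
print(re.sub(r'[0-9]*\.[0-9]*\t', '', line))  # 'dit is een zin'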
Code example #13
def clean_googlengram(line):
    """Removes speechtags from line specific to the googlengram module

    Param:
        line (unicode)
    Returns:
        line (unicode)
    """
    return_line = line.split("\t")[
        0]  # Get the ngram, remove year, counter, etc
    clean = []
    words = WhitespaceTokenizer().tokenize(return_line)
    for word in words:
        # In >1-grams, transitions to specific tags are written as
        # "The_ADJ _NOUN_" (meaning: from "The" there is a transition to a noun).
        # We remove those.
        if word[0] != '_' and word[-1] != '_':
            # Split the token and the tag based on the '_'
            token, tag = str2tuple(word, '_')
            # Punctuation will be added using rules.
            if len(token) > 1:
                if tag not in ('PUNCT', '.', ''):
                    clean.append(token)
            elif token not in punctuation:
                clean.append(token)
    return_line = ' '.join(clean)
    if return_line != line:
        return True, return_line
    else:
        return False, line
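For context, str2tuple (imported from NLTK in the surrounding module) splits a token from its tag at the given separator and upper-cases the tag; a quick check on made-up inputs:

from nltk.tag import str2tuple

print(str2tuple("analysis_NOUN", "_"))  # ('analysis', 'NOUN')
print(str2tuple("The_DET", "_"))        # ('The', 'DET')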
Code example #14
def main(tweet):
    #tweet = input("enter tweet here: ")
    tk = WhitespaceTokenizer()
    words = tk.tokenize(tweet)
    words_with_pos = pos_tag(words)
    queries = formQueries(words_with_pos)
    return scrapeWebForEachQuery(queries)
Code example #15
def lemmatize_and_tag(text):
    '''Tokenize the text and lemmatize each word by its POS tag. Add likely high-frequency character names to the global list nnp_to_remove.
    '''
    global nnp_to_remove
    lemmatizer = WordNetLemmatizer()
    words = []
    for w, p in pos_tag(WhitespaceTokenizer().tokenize(text)):
        w = w.strip('1234567890"' + string.punctuation).lower()
        if w.lower() in stop_word or not w:
            continue
        if p in pos_to_wornet_dict.keys():
            if p == 'NNP' and 'V' in pos_tag(
                [w.lower()]
            )[0][1]:  # Verbs at the beginning of a sentence are sometimes classified as NNP. Discard them.
                continue
            words.append(
                (lemmatizer.lemmatize(w, pos_to_wornet_dict[p]).lower(), p))
    # Find the words with the highest frequency. If they are NNP, add them to the list of words to remove, as they are most likely character names.
    bow = Counter(words)
    nnp_to_remove.update({
        x[0][0]
        for x in sorted(bow.items(), key=lambda x: x[1], reverse=True)[:8]
        if x[0][1] == 'NNP'
    })
    return ' '.join([
        x for x in list(zip(*words))[0]
        if (x not in nnp_to_remove and len(x) > 1)
    ])
Code example #16
    def __init__(self, name, config):
        """
        Initializes the component.

        :param name: Component name (read from configuration file).
        :type name: str

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        :type config: :py:class:`ptp.configuration.ConfigInterface`

        """
        # Call constructors of parent classes.
        Component.__init__(self, name, SentenceTokenizer, config)

        # Read the actual configuration.
        self.mode_detokenize = config['detokenize']

        # Tokenizer.
        self.tokenizer = WhitespaceTokenizer()

        # Set key mappings.
        self.key_inputs = self.stream_keys["inputs"]
        self.key_outputs = self.stream_keys["outputs"]

        if self.mode_detokenize:
            # list of strings -> sentence.
            self.processor = self.detokenize_sample
        else:
            # sentence -> list of strings.
            self.processor = self.tokenize_sample
Code example #17
    def __init__(self,
                 filename: str,
                 concat_glove: bool,
                 glove_vectors: Vectors,
                 elmo_model: ELMoModel,
                 lowercase_sentences: bool = False,
                 tokenize_sentences: bool = True,
                 only_news: bool = False):

        assert os.path.splitext(filename)[1] == '.csv', \
            'Metaphor dataset file should be of type CSV'

        self.concat_glove = concat_glove
        self.glove_vectors = glove_vectors
        self.tokenizer = WhitespaceTokenizer()
        self.lowercase_sentences = lowercase_sentences
        self.tokenize_sentences = tokenize_sentences
        self.only_news = only_news
        self.elmo_model = elmo_model

        self._sentences, self._labels = self._parse_csv_file(filename)

        self.pos_weight = 1 / (
            sum([sum([label for label in labels])
                 for labels in self._labels]) /
            sum([sum([1 for label in labels]) for labels in self._labels]))

        self.elmo_filename = self._assert_elmo_vectors_file(
            filename, self._sentences)

        self._data_size = len(self._sentences)
Code example #18
def N_gram_Tokenizer(pathToFiles, n):
    stemmer = PorterStemmer()
    ngrams = {}
    #check if the file has gold standard if not continue to next file
    for filename in os.listdir(pathToFiles):
        if filename not in filename_overlap:
            continue
        with open(os.path.join(pathToFiles, filename)) as currentFile:
            ngrams[filename] = {}
            tokens_in_window = []
            for line in currentFile:
                #Tokenize on whitespace
                tokens = WhitespaceTokenizer().tokenize(line)
                for token in tokens:
                    token = token.split("_")
                    token = token[0].lower()
                    token = stemmer.stem(token)
                    tokens_in_window.append(token)
                    if len(tokens_in_window) > n:
                        tokens_in_window = tokens_in_window[1:]
                    newNGram = ''
                    if len(tokens_in_window) == n:
                        for currentToken in tokens_in_window:
                            newNGram = '{} {}'.format(newNGram, currentToken)
                        newNGram = newNGram.strip()
                    if newNGram:
                        ngrams[filename][newNGram] = 0
    return ngrams
Code example #19
def extract_tokens(row, lemmatize=True, use_tag=True):
    tokenizer = WhitespaceTokenizer()
    if lemmatize:  # reduce words to lemmas
        pattern = '[().*+,?!\'\";:]*'
        token_list = list()
        if use_tag:  # use POS tags to obtain more accurate lemmas
            pos_tags = PerceptronTagger().tag(tokenizer.tokenize(row['text']))
            lemmatizer_input = map(
                lambda x: (x[0], nltk_to_wordnet.get(x[1][0])), pos_tags)
            lemmatizer = WordNetLemmatizer()
            for word, tag in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    word = word.lower()
                    if tag is None:
                        tok = lemmatizer.lemmatize(word)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
                    else:
                        tok = lemmatizer.lemmatize(word, tag)
                        tok = re.sub(pattern, '', tok)
                        if not tok.isdigit():
                            token_list.append(tok)
        else:  # do not use a tagger if not specified and speed up computation
            lemmatizer_input = tokenizer.tokenize(row['text'])
            lemmatizer = WordNetLemmatizer()
            for word in lemmatizer_input:
                if word != 'urlLink' and 'http:' not in word:
                    tok = lemmatizer.lemmatize(word.lower())
                    tok = re.sub(pattern, '', tok)
                    if not tok.isdigit():
                        token_list.append(tok)
    else:  # simply tokenize based on whitespaces
        token_list = tokenizer.tokenize(row['text'])
    return token_list
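The snippet relies on a module-level nltk_to_wordnet dict that is not shown. A typical definition (an assumption for illustration, not the project's actual code) maps the first letter of a Penn Treebank tag to a WordNet POS constant accepted by WordNetLemmatizer:

from nltk.corpus import wordnet

# Hypothetical mapping: first character of a Penn Treebank tag -> WordNet POS.
nltk_to_wordnet = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}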
Code example #20
def tweet_clean(tweet):

    # Remove HTML special entities (e.g. &amp;)
    tweet_c1 = re.sub(r'\&\w*;', '', tweet)

    # Remove hyperlinks
    tweet_c2 = re.sub(r'https?:\/\/.*\/\w*', '', tweet_c1)

    # Remove punctuation
    tweet_c3 = re.sub(r'[' + punctuation2.replace('@', '') + ']+', ' ',
                      tweet_c2)

    # Conversion to lowercase
    tweet_c4 = tweet_c3.lower()

    # Remove emoticons
    tweet_c5 = emoji_pattern.sub(r'', tweet_c4)

    # Tokenize with WhitespaceTokenizer to handle hashtag
    tokens = WhitespaceTokenizer().tokenize(tweet_c5)

    # Remove stopwords
    stop_words = set(nltk.corpus.stopwords.words('italian'))
    filt_words = [w for w in tokens if not w in stop_words]

    # stemming words (...)
    stemmer = SnowballStemmer("italian")
    stem_words = [stemmer.stem(w) for w in filt_words]
    return stem_words
Code example #21
from nltk.tokenize import WhitespaceTokenizer


def tokenize(documents):
    """
    Tokenizes each document on whitespace.

    :param documents: list of text strings to tokenize.
    :return: list of token lists (one per document), or None if documents is None.
    """
    tokenizer = WhitespaceTokenizer()

    def tokenize_doc(document):
        return tokenizer.tokenize(document)

    if documents is None:
        return None

    documents = list(map(tokenize_doc, documents))

    # Return an occurrence matrix instead of a frequency matrix
    # if unique is True:
    #   # set() removes duplicates and returns a dict, convert back into list
    #   words = list(set(words))

    return documents
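A quick usage check of the function above on toy input:

print(tokenize(["Hello world", "split   on   whitespace"]))
# [['Hello', 'world'], ['split', 'on', 'whitespace']]
print(tokenize(None))  # None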
Code example #22
File: tokenizer.py Project: zhangAlwin/tokenquery
    def tokenize(self, text):
        """
           tokenize text into a list of Token objects

            :param text: text to be tokenized (might contains several sentences)
            :type text: str
            :return: List of Token objects
            :rtype: list(Token)
        """
        tokens = []

        if self.tokenizer_type == "SpaceTokenizer":
            operator = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
            operator = WhitespaceTokenizer()
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "PTBTokenizer":
            ptb_tokens = word_tokenize(text)
            counter = 0
            for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
                new_token = Token(counter, token, span[0], span[1])
                counter += 1
                tokens.append(new_token)

        return tokens
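The Token class here is project-specific and not shown. Purely as an illustration (plain tuples standing in for Token objects), the NLTKWhiteSpaceTokenizer branch does the equivalent of:

from nltk.tokenize import WhitespaceTokenizer

text = "Don't hesitate to ask questions"
tokens = [(counter, text[start:end], start, end)
          for counter, (start, end) in enumerate(WhitespaceTokenizer().span_tokenize(text))]
print(tokens)
# [(0, "Don't", 0, 5), (1, 'hesitate', 6, 14), (2, 'to', 15, 17),
#  (3, 'ask', 18, 21), (4, 'questions', 22, 31)]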
Code example #23
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer


def tokenize(text):
    """using nltk to case normalize, lemmatize, and tokenize text. 
    This function is used in the machine learning pipeline to, 
    vectorize and then apply TF-IDF to the text.
    
     Args:
        text (str): A disaster message. 
        
    Returns: 
        processed_tokens (list): list of cleaned tokens in the message.
        
    """
    # get tokens from text
    tokens= WhitespaceTokenizer().tokenize(text)
    lemmatizer= WordNetLemmatizer()
    
    # clean tokens
    processed_tokens=[]
    for token in tokens:
        token = lemmatizer.lemmatize(token).lower().strip('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
        token = re.sub(r'\[[^.,;:]*\]', '', token)
        
        # add token to compiled list if not empty
        if token !='':
            processed_tokens.append(token)
    return processed_tokens
Code example #24
File: poem.py Project: sagangwee/pocab-poetry
def generate_n_gram(corpus):
    # In this example I'm using a corpus from the NLTK Gutenberg Project:
    # Sara Bryant - Stories to Tell to
    path = "./corpora/" + corpus
    reader = PlaintextCorpusReader(path, '.*\.txt', WhitespaceTokenizer())
    sentences = reader.sents()

    # Process text and collect reverse N-grams sentence by sentence
    # Do not do this word by word or you'll have incoherent N-grams that span sentences
    def processText(sentence):
        tokens = []
        for word in sentence:
            valid = True
            for c in word:
                if c in string.punctuation and c != "'":
                    valid = False
            if valid:
                tokens.append(word.lower())
        return tokens

    ngrams = []
    for sentence in sentences:
        tokens = processText(sentence)
        ngrams += reverseNgrams(tokens, 3)

    # print string.punctuation
    model = setupModel(ngrams)

    with open(corpus + '.json', 'w') as outfile:
        json.dump(model, outfile)
Code example #25
def MRR(pathToFiles):
    stemmer = PorterStemmer()
    ngrams = {}
    for filename in os.listdir(pathToFiles):
        with open(os.path.join(pathToFiles, filename),
                  encoding="utf8",
                  errors="ignore") as currentFile:
            lines = [line.strip('\n') for line in currentFile]
            ngrams[filename] = {}
            tokens_in_window = []
            for line in lines:
                #tokenize on whitespace
                tokens = WhitespaceTokenizer().tokenize(line)
                tokens_in_window = []
                for token in tokens:
                    token = token.split("_")
                    token = token[0].lower()
                    #stemming using Porter Stemmer
                    token = stemmer.stem(token)
                    if not token:
                        continue
                    tokens_in_window.append(token)
                newNGram = ''
                for currentToken in tokens_in_window:
                    newNGram = '{} {}'.format(newNGram, currentToken)
                newNGram = newNGram.strip()
                if newNGram:
                    ngrams[filename][newNGram] = 0
    return ngrams
Code example #26
def preprocess_txt(doc):
    #lowercase
    doc = doc.lower() 
    #remove "{html}" strings
    doc = re.sub('\{html\}', '', doc)
    #remove html tags
    doc = BeautifulSoup(doc, 'html.parser').get_text()
    #remove all paths/urls/--keys
    pattern = re.compile(r'[/\-+\\+]')
    doc_split = [token for token in WhitespaceTokenizer().tokenize(doc) if not pattern.findall(token)]
    doc = " ".join(doc_split)

    #tokenize and remove stop words and punctuation symbols and spaces using spaCy
    #use lemmas
    doc_spacy = sp(doc)
    doc_tokenized_spacy = [token.lemma_ for token in doc_spacy
        if not token.is_stop and not token.is_punct and not token.is_space]

    # additional preprocessing with nltk gives much better results
    doc_nltk = " ".join(doc_tokenized_spacy)
    #tokenize and remove stop words and punctuation symbols using nltk 
    #remove numerics
    doc_tokenized_spacy_nltk = [token for token in nltk.word_tokenize(doc_nltk)
        if token.isalpha()]
    
    return doc_tokenized_spacy_nltk 
Code example #27
File: model.py Project: luungoc2005/nlp-test
class VNTokenizer(nn.Module):

    def __init__(self, config):
        super(VNTokenizer, self).__init__()
        self.max_emb_words = config.get('max_emb_words')
        self.embedding_dim = config.get('embedding_dim', EMBEDDING_DIM)
        self.char_embedding_dim = config.get('char_embedding_dim', CHAR_EMBEDDING_DIM)
        self.hidden_dim = config.get('hidden_dim', 1200)
        self.num_layers = config.get('num_layers', 3)
        self.dropout_prob = config.get('dropout_prob', .2)
        self.is_cuda = is_cuda if is_cuda is not None else torch.cuda.is_available()

        self.word_encoder = to_gpu(BRNNWordEncoder(self.char_embedding_dim, rnn_type='LSTM'))
        self.dropout = nn.Dropout(self.dropout_prob)

        # 0: reserved index by Keras tokenizer
        # num_words + 1: index for oov token
        self.embedding = nn.Embedding(self.max_emb_words + 2, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim + self.char_embedding_dim,
                            self.hidden_dim // 2,
                            num_layers=self.num_layers,
                            bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim, 1)

        # Set tokenizer
        self.tokenizer = tokenizer

        self.tokenize_fn = WhitespaceTokenizer().tokenize
Code example #28
def nltk_tokenizer(tweets):
    tokenizers = {"TreebankWordTokenizer": {"tokens": [TreebankWordTokenizer().tokenize(tweet) for tweet in tweets]},
                  "WordPunctTokenizer": {"tokens": [WordPunctTokenizer().tokenize(tweet) for tweet in tweets]},
                  "WhitespaceTokenizer": {"tokens": [WhitespaceTokenizer().tokenize(tweet) for tweet in tweets]},} 
    
    tokenizers = analyzer(tokenizers, tweets)
    return tokenizers
Code example #29
File: aligned.py Project: sahitpj/nltk
    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
Code example #30
File: util.py Project: dannykliu/toxic-comments
def clean_text(text):
    """ Removes punctuation, capitalizations, numbers, stop words, and stems words"""
    ps = PorterStemmer()

    stop_words = set(stopwords.words('english'))

    text = text.lower()
    text = contractions.expandContractions(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r'\W', ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    text = re.sub(
        r'(.)\1\1+', r'\1\1',
        text)  # letters repeated 3 or more times in a row are repeated twice
    text = re.sub(r'(ha)\1\1+', r'haha', text)
    text = re.sub(r'(lo)\1\1+', r'lol', text)
    text = text.strip(' ')

    # stem words
    tokenizer = WhitespaceTokenizer()
    tokenized_comment = tokenizer.tokenize(text)
    filtered_sentence = [w for w in tokenized_comment if not w in stop_words]
    stemmed_comment = [ps.stem(word) for word in filtered_sentence]
    text = " ".join(stemmed_comment)
    return text
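The repeated-character rules near the end are easy to sanity-check in isolation (the sample strings are made up):

import re

print(re.sub(r'(.)\1\1+', r'\1\1', "sooooo gooood"))  # soo good
print(re.sub(r'(ha)\1\1+', r'haha', "hahahahaha!"))   # haha!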