Example #1
    def timexTagAndTokenizeText(self, altText=None):
        """In this method, two steps are required, so if altText is specified, all steps are done inside the if statement, so incorrect dict entries aren't stored"""
        if altText is not None:
            raw = altText
            altOutputStep1 = self.timexTagText(raw)
            altOutputStep2 = self.wordTokenizeText(altOutputStep1)
            time_tagged_and_tokenizedText = MWETokenizer(
                mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                separator='').tokenize(altOutputStep2)

            return time_tagged_and_tokenizedText
        else:
            """Tag all temporal expressions with timex2 tags."""
            """Don't need to open file here, because it's opened in timexTagText()"""
            tagged = self.timexTagText()
            """Word-tokenize all text above"""
            word_tagged = self.wordTokenizeText(tagged)
        '''consolidate all broken apart Timex2 tags into single "words"'''
        if self.textList.get('timexTagAndTokenizeText') is None:
            self.textList['timexTagAndTokenizeText'] = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'),
                                   ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x) for x in word_tagged
            ]

        print(self.textList.get('timexTagAndTokenizeText'))
        return self.textList.get('timexTagAndTokenizeText')
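The MWETokenizer call above glues the word-tokenized TIMEX2 tag pieces back into single tokens. A minimal standalone sketch of that step (the token list below is made up for illustration):

from nltk.tokenize import MWETokenizer

# Hypothetical word-tokenized output in which the TIMEX2 tags were split apart
tokens = ['<', 'TIMEX2', '>', 'last', 'week', '<', '/TIMEX2', '>', 'she', 'recovered', '.']
merger = MWETokenizer(mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')], separator='')
print(merger.tokenize(tokens))
# ['<TIMEX2>', 'last', 'week', '</TIMEX2>', 'she', 'recovered', '.']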
Example #2
def tokenize(multi_word_queries, text):
    """Returns a list of words that make up the text.
    Params: {multi_word_queries: List of multi-word query strings, text: String}
    Returns: List
    """
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(
        r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
    result = tokenizer.tokenize(lower_case)
    multi_tokenizer = MWETokenizer([('working', 'out'), ('coffee', 'shops'),
                                    ('average', 'prices'), ('union', 'square'),
                                    ('real', 'estate'), ('ice', 'cream'),
                                    ('whole', 'foods'), ('co', 'op'),
                                    ('wall', 'street'), ('world', 'trade'),
                                    ('high', 'school'), ('dim', 'sum'),
                                    ('empire', 'state'), ('high', 'rise'),
                                    ('walk', 'ups')])
    if len(multi_word_queries) > 0:
        for tok in multi_word_queries:
            if (len(tok.split('_')) > 1):
                multi_tokenizer.add_mwe(tuple(tok.split('_')))
    #add neighborhood names
    for n in neighborhood_name_phrases:
        multi_tokenizer.add_mwe(tuple(n.split('_')))

    result2 = multi_tokenizer.tokenize(result)
    return result2
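A standalone sketch of the two-stage pattern above (sample query text made up, neighborhood list omitted): the RegexpTokenizer keeps negation phrases together, then the MWETokenizer merges known multi-word expressions:

from nltk.tokenize import MWETokenizer, RegexpTokenizer

regex_tok = RegexpTokenizer(r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
tokens = regex_tok.tokenize('not very good coffee shops near union square')
# ['not very good', 'coffee', 'shops', 'near', 'union', 'square']
mwe_tok = MWETokenizer([('coffee', 'shops'), ('union', 'square')])
print(mwe_tok.tokenize(tokens))
# ['not very good', 'coffee_shops', 'near', 'union_square']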
Example #3
    def __init__(self, locations):
        self.tokenizer = MWETokenizer()
        self.time_tagger = TimeTagger()
        for a in locations:
            self.tokenizer.add_mwe(a.split())
        # Rules defined
        self.specials = {
            "ACTIVITY": activities.union(["driving", "flight"]),
            "REGION": regions,
            "KEYWORD": [word for word in all_keywords if ' ' in word],
            "LOCATION": locations,
            "QUANTITY": ["at least", "more than", "less than", "at most",
                         "not more than", "a number of"],
            "IN": ["in front of", "called"],
            "NN": [phrase.replace("_", " ") for phrase in list(phrases.keys())],
            "SPACE": ["living room", "fastfood restaurant", "restaurant kitchen", "restaurant",
                      "dining hall", "food court", "butchers shop", "restaurant patio",
                      "coffee shop", "room", "hotel room", "kitchen", "office",
                      "airport", "salon"],
            "POSITION": ["side", "foreground", "background", "right", "left",
                         "image"],
            "TOBE": ["am", "is", "are", "be", "is being", "am being", "are being", "being"],
            "WAS": ["was", "were", "had been", "have been"],
            "TIMEPREP": ["prior to", "then", "earlier than", "later than", "sooner than"],
            "POSITION_PREP": ["near", "distance to"],

        }
        for tag in self.specials:
            for keyword in self.specials[tag]:
                if ' ' in keyword:
                    self.tokenizer.add_mwe(keyword.split())
Example #4
    def __init__(self,
                 ents=None,
                 tag2ent=None,
                 collocations=special_collocations,
                 appos=collocations.appos):
        self.__tokenizer = TweetTokenizer(reduce_len=True)
        self.__collocations = collocations
        self.__tknzr = MWETokenizer(self.__collocations)

        self.__lemm = WordNetLemmatizer()
        self.__nlp = spacy.load("en_core_web_sm")
        if ents is None:
            self.__ents = {}
            if tag2ent is not None:
                raise ValueError(
                    "ents and tag2ent must both be None or both be not None")
            self.__tag2ent = {}
        else:
            if tag2ent is None:
                raise ValueError(
                    "ents and tag2ent must both be None or both be not None")
            self.__ents = ents
            self.__tag2ent = tag2ent
        self.__appos = appos
        for a in appos:
            self.__appos[a] = '_'.join(self.__appos[a].split())

        self.__punctuation = punctuation + "“”‘’‚"
        self.__stop_symbols = '←↓→↑'
Example #5
def segment(text,
            userdict_filepath="userdict2.txt",
            stopwords_filepath='stopwords.txt'):
    import nltk
    with open(stopwords_filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip().lower() for line in f]
    final_list = []
    temp_list = []
    with open(userdict_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            temp_list.append(line.strip(' ').strip('\n'))
    temp = []
    for line in temp_list:
        for li in line.lower().split(' '):
            if len(li) != 0:
                temp.append(li.strip('\t'))
        final_list.append(tuple(temp))
        temp.clear()

    userdict_list = final_list
    tokenizer = MWETokenizer(userdict_list, separator=' ')

    seg_list = tokenizer.tokenize(
        nltk.word_tokenize(remove_symbols(text).lower()))

    seg_list_without_stopwords = []

    for word in seg_list:
        if word not in stopwords:
            if word != '\t':
                seg_list_without_stopwords.append(word)
    return seg_list_without_stopwords
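A minimal sketch of what the tokenizer built from the user dictionary does, using a couple of made-up entries in place of the userdict2.txt file:

from nltk.tokenize import MWETokenizer

userdict_list = [('machine', 'learning'), ('new', 'york')]  # stand-ins for userdict2.txt entries
tokenizer = MWETokenizer(userdict_list, separator=' ')
print(tokenizer.tokenize('she studies machine learning in new york'.split()))
# ['she', 'studies', 'machine learning', 'in', 'new york']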
Example #6
    def get_context(self, query_str, text, k=10):
        if query_str in text:
            tokenizer = MWETokenizer()
            query_str_tokens = tuple(query_str.split())
            query_str_dashed = "_".join(query_str_tokens)
            tokenizer.add_mwe(query_str_tokens)
            text_token = tokenizer.tokenize(text.split())
            try:
                t_start = text_token.index(query_str_dashed)
            except ValueError:
                return None, None, None
            t_end = t_start + 1
            start_index = max(t_start - k, 0)
            end_index = min(t_end + k, len(text_token))
            text_token_query = text_token[start_index:t_start] + text_token[
                t_end + 1:end_index]
            context = " ".join(text_token_query)
            context_mention = text_token[start_index:t_start] + [
                query_str
            ] + text_token[t_end + 1:end_index]
            context_mention = " ".join(context_mention)
            return context, text_token_query, context_mention
        else:
            logging.info('error, query not in text')
            return None, None, None
Example #7
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
        pass

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
Example #8
def tokenization(docs):
    documents = {}

    for doc in docs:
        document_plain = docs[doc]
        document_plain = document_plain.replace("/", "").replace("-", " ")
        #re.sub(r'\([^)]*\)', '', document_plain)
        document_plain = re.sub(r'\([0-9]*\)', '', document_plain)

        relevant_words = []
        mwetokenizer = MWETokenizer()
        document_ner = spacy_nlp(document_plain)

        for element in document_ner.ents:
            # don't consider numbers
            if element.label_ != "CARDINAL":
                relevant_words.append(element)

        #for each relevant word, if whitespace is present, create a single token with all the words
        for word in relevant_words:
            token = str(word).split()
            if len(token) > 1:
                move_data = []
                for element in token:
                    move_data.append(element)
                tup = tuple(move_data)
                mwetokenizer.add_mwe(tup)

        document_tokenized = word_tokenize(document_plain)
        document_retokenized = mwetokenizer.tokenize(document_tokenized)

        documents[doc] = document_retokenized
    return documents
Example #9
    def __init__(self, filename):
        """initializes a LyricsCleaner object"""
        self._filename = filename
        self._tokenizer = MWETokenizer()
        for word in SIGNAL_WORDS:
            self._tokenizer.add_mwe(('[', word, ']'))
        self._stemmer = LancasterStemmer()
Example #10
def trim_bio(text):

    # keywords to return
    keywords = []

    # load from file after custom edit
    df_keyword = pd.read_csv(local_data + "data/keywords/df.csv")

    ## convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()

    ## format important words so that they can be registered to tokenizer
    important_words = [x.split() for x in important_words]

    # initialize tokenizer
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe([x for x in iw])  # add important words
        #tokenizer.add_mwe(iw)  # add important words

    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])

    # find important words from tokens, append it to keyword
    for iw in important_words:
        iw_joined = "_".join(iw)
        if (iw_joined in tokens):
            keywords.append(iw_joined)

    return keywords
Example #11
def init_base_order_tokenizer():
    p = nltk.PorterStemmer()
    food_tokenizer = MWETokenizer()
    food_items = {}
    prices_items = {}
    image_items = {}
    cal_items = {}
    with open('sheet1.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            food_item = row['Menu Item'].replace(' ', '_').lower()
            price = float(row['Price'])
            image = row['Image']
            cal = float(row['Calories'])
            image_items[food_item] = image
            food_items[food_item] = 0
            prices_items[food_item] = price
            cal_items[food_item] = cal

            items_stem = [
                p.stem(i) for i in row['Menu Item'].lower().split(' ')
            ]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))

    with open('mwe.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            items_stem = [
                p.stem(i) for i in row['Menu Item'].lower().split(' ')
            ]
            if len(items_stem) > 1:
                food_tokenizer.add_mwe(tuple(items_stem))

    return food_tokenizer, food_items, prices_items, cal_items, image_items
Example #12
def initialize_known_phrase_tokenization(phrases):
    from nltk.tokenize import MWETokenizer
    tokenizer = MWETokenizer()
    for phrase in phrases:
        if (phrase):
            phrase_as_list = phrase.replace("_", " ").split()
            tokenizer.add_mwe(phrase_as_list)
    return tokenizer
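A quick usage sketch, assuming the function above is in scope and using a made-up phrase list:

tokenizer = initialize_known_phrase_tokenization(['ice_cream', 'san francisco'])
print(tokenizer.tokenize('we ate ice cream in san francisco'.split()))
# ['we', 'ate', 'ice_cream', 'in', 'san_francisco']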
Example #13
def multiword_tokenizer(token_list, bigram_list):
    """
    Tokenize a list of unigram tokens into bigram tokens,
    given a list of bigrams.
    Bigrams are separated with "__"
    """
    mwetokenizer = MWETokenizer(bigram_list, separator="__")
    return mwetokenizer.tokenize(token_list)
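A quick usage sketch, assuming the function above is in scope and using a made-up bigram list:

print(multiword_tokenizer(['climate', 'change', 'is', 'real'], [('climate', 'change')]))
# ['climate__change', 'is', 'real']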
Example #14
    def _init_mwe_tokenizer(self):
        def multi_word_expressions():
            for entity in self.vocab:
                if entity.find(self._PHRASE_DELIMITER) != -1:
                    yield entity.split(self._PHRASE_DELIMITER)

        it = multi_word_expressions()
        self._mwe_tokenizer = MWETokenizer(it)
Example #15
    def fit(self, X, **fit_params):
        """
        Procedure to iteratively contract bigrams (up to max_collocation_iterations times)
        that score higher on the collocation_function than the min_collocation_score (and satisfy other
        criteria set out by the optional parameters).
        """
        self.tokenization_ = X
        n_tokens = sum([len(x) for x in X])
        for i in range(self.max_iterations):
            bigramer = BigramCollocationFinder.from_documents(
                self.tokenization_)

            if self.ignored_tokens is not None:
                ignore_fn = lambda w: w in self.ignored_tokens
                bigramer.apply_word_filter(ignore_fn)

            if self.excluded_token_regex is not None:
                exclude_fn = (lambda w: re.fullmatch(self.excluded_token_regex, w) is not None)
                bigramer.apply_word_filter(exclude_fn)

            if self.min_token_occurrences is not None:
                minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
                bigramer.apply_word_filter(minocc_fn)

            if self.max_token_occurrences is not None:
                maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
                bigramer.apply_word_filter(maxocc_fn)

            if self.min_token_frequency is not None:
                minfreq_fn = (lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens)
                bigramer.apply_word_filter(minfreq_fn)

            if self.max_token_frequency is not None:
                maxfreq_fn = (lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens)
                bigramer.apply_word_filter(maxfreq_fn)

            if self.min_ngram_occurrences is not None:
                bigramer.apply_freq_filter(self.min_ngram_occurrences)

            new_grams = list(
                bigramer.above_score(self.score_function, self.min_score))

            if len(new_grams) == 0:
                break

            self.mtes_.append(new_grams)

            contracter = MWETokenizer(new_grams)
            self.tokenization_ = tuple([
                tuple(contracter.tokenize(doc)) for doc in self.tokenization_
            ])

        return self
Example #16
def tokenize_and_remove_punct(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    mtokenizer = MWETokenizer()
    mwe = mtokenizer.tokenize(text.split())
    words = []
    for t in mwe:
        if t.isalpha():
            words.append(t)
    return words
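A quick usage sketch, assuming the function above is in scope (the bare MWETokenizer() has no registered expressions, so it simply passes the tokens through):

print(tokenize_and_remove_punct("Hello, world! It's 2024."))
# ['Hello', 'world', 'Its']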
Example #17
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params

    idf_list = [*idf]
    idf_set = set(idf_list)

    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    phrases_score = {}
    for phrase in tqdm(list_phrases,
                       desc='phrase-eval-{}'.format(pid),
                       mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))

        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)

        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = target_token_freq[token] / len(
                target_token) * idf[token]

        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = phrase_token_freq[token] / len(
                tokens) * idf[token]

        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)

        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})

    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)

    return {key: phrases_score[key] for key in top_10}
Example #18
def LoadTokenizer():
    global tokenizer
    tokenizer = MWETokenizer(separator=' ')
    for spword in WordDict:
        if ' ' in spword:
            tupleword = tuple(spword.split(' '))
            tokenizer.add_mwe(tupleword)
        if ':' in spword:
            tupleword = tuple(re.split(r"(:)", spword))
            tokenizer.add_mwe(tupleword)
Example #19
    def form_mwe_tokenizer(self):
        mwes = []

        for _, entry in self.text_entries.items():
            term = entry["lemma"]
            splitted = term.split()
            if len(splitted) > 1:
                mwes.append(tuple(splitted))

        return MWETokenizer(mwes=mwes, separator=" ")
Example #20
    def timexTagAndTokenizeText(self, altText=None):
        """Tags temporal expressions with nltk timex2, and tokenizes the resultant text.

        Args:
            altText (str): The text to be tagged, if it is not the same as the whole narrative the preprocessor was created with. This text won't be stored.
        
        Returns:
            tokenized text (nested list, by sentence): 
            ex. [['This', 'is', 'a', 'sentence', '.'],['And', 'maybe', 'another']]

        """
        """In this method, two steps are required, so if altText is specified, all steps are done inside the if statement, so incorrect dict entries aren't stored"""
        if altText is not None:
            raw = altText
            altOutputStep1 = self.timexTagText(raw)
            altOutputStep2 = self.wordTokenizeText(altOutputStep1)
            time_tagged_and_tokenizedText = MWETokenizer(
                mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                separator='').tokenize(altOutputStep2)

            return time_tagged_and_tokenizedText
        else:
            """Tag all temporal expressions with timex2 tags."""
            """Don't need to open file here, because it's opened in timexTagText()"""
            tagged = self.timexTagText()
            """Word-tokenize all text above"""
            word_tagged = self.wordTokenizeText(tagged)
        '''consolidate all broken apart Timex2 tags into single "words"'''
        if Preprocessor.textList.get('timexTagAndTokenizeText') is None:
            nestedListOutput = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'),
                                   ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x) for x in word_tagged
            ]

            #Remove or change this line if we don't want a flattened (one-dimensional) list. See the comment below.
            Preprocessor.textList['timexTagAndTokenizeText'] = [
                item for sublist in nestedListOutput for item in sublist
            ]
        """Currently, the output is a flattened list, we need to decide if we want to keep the sentence structure (making the output a list of lists.
        This throws off the AEExtractor and the SuspectExtractor, which need to then be fixed."""
        return Preprocessor.textList.get('timexTagAndTokenizeText')
Example #21
def multiword_tokenize(text, mwe):
    # The tokenizer splits contractions ("Don't" => 'Do', "n't"); the sentence-delimiting ',' and '.' become separate tokens
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(word_tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
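A quick usage sketch, assuming the function above is in scope and the NLTK 'punkt' data is installed:

print(multiword_tokenize("She lives in New York City.", ["New York City"]))
# ['She', 'lives', 'in', 'New York City', '.']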
Example #22
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
        pass

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # For each sentence in the sentences

                # Tokenize the sentence based on Regex and then using MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))

                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]

                # Lemmatize the sentence. Find the POS tags and then lemmatize
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag)) for wrd, tag in tokens_lowercase_tagged]

                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]

                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]

                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
Example #23
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(0, len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        tokenizer.add_mwe(aspect_split)
        aspect_tokenized.append(aspect_split)  # add_mwe() returns None, so store the tuple itself
    for j in range(0, len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
Example #24
    def sentence_filter(self, sentence):  # preliminary tokenization and cleaning of the sentence

        if self.language == 'chinese':

            import jieba.posseg as psg

            return psg.cut(sentence)  # jieba's POS segmentation interface handles tokenization and cleaning directly

        elif self.language == 'english':

            from nltk.tokenize import MWETokenizer  # use the MWE tokenizer

            tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with underscore '_'

            nlp = spacy.load('en_core_web_sm')  # create the spaCy tokenizer

            # for word in self.userdict:    # add custom words to spaCy; seems to have no effect
            #     lex = nlp.vocab[word]

            # clean up punctuation
            quote_double_pattern = re.compile('“|”')
            quote_single_pattern = re.compile('‘|’')
            punc_pattern = re.compile(
                "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,")

            sentence = re.sub(quote_double_pattern, '"', sentence)
            sentence = re.sub(quote_single_pattern, "'", sentence)  # keep 's and s' intact, so don't simply delete the quote
            sentence = re.sub(punc_pattern, ' ', sentence)

            # nltk and spacy return token lists; pke would take the full sentence
            # return nlp(' '.join(sentence.split()))    # spacy
            return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))  # nltk + spacy: first merge phrases with nltk, then tokenize with spaCy
            # return sentence    # pke

        elif self.language == 'japanese':

            mecab = MeCab.Tagger('')  # use MeCab's tokenizer directly; a custom dictionary can't be added yet, so some proper nouns are missed (e.g. 比特/币)

            # clean up punctuation
            punc_pattern = re.compile(
                "\xa0|\t|\n|\:|\;| — | - |\!|\@|\#|\$|\%|\^|\&|\*|\_|\?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|/|・|「|」|•|※")
            sentence = re.sub(punc_pattern, ' ', sentence)

            sentence = [
                (
                    chunk.split('\t')[0],
                    chunk.split('\t')[1].split(',')[0]
                )
                for chunk in mecab.parse(sentence).splitlines()[:-1]
            ]  # take the surface form and its POS from each parsed line

            return sentence
Example #25
def parseWordsFromEntry(entry, vocab_cap=10000):
    '''Tokenizes an entry into a list of words.'''
    '''Calculates their indices relative to their frequencies.'''
    unknown = "UNKNOWN_WORD"
    tokenizer = MWETokenizer()
    words = entry.split()
    #words = tokenizer.tokenize(entry.split())
    frequencies = findWordFrequencyDists(words)
    vocab = frequencies.most_common(vocab_cap - 1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown)
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    return word_to_index
Example #26
def text_process_group(mess):
    """
    1. Lower case the input
    2. Remove punctuation except '-'
    3. Apply custom tokenizer
    4. Return column of clean text words"""
    mess = mess.lower()
    regex = r"[^\P{P}-]+"
    new_mess = re.sub(regex, " ", mess, 0)
    tokenizer = MWETokenizer(all_list, separator=' ')
    token = tokenizer.tokenize(new_mess.lower().split())
    sw = [x for x in token if x not in stopwords.words('english')]
    return sw
Example #27
    def sentence_filter(self, sentence):
        tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with underscore '_'
        nlp = spacy.load('en_core_web_sm')  # create the spaCy tokenizer
        quote_double_pattern = re.compile('“|”')
        quote_single_pattern = re.compile('‘|’')
        punc_pattern = re.compile(
            "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,"
        )
        sentence = re.sub(quote_double_pattern, '"', sentence)
        sentence = re.sub(quote_single_pattern, "'",
                          sentence)  # keep 's and s' intact, so don't simply delete the quote
        sentence = re.sub(punc_pattern, ' ', sentence)
        return nlp(' '.join(tokenizer.tokenize(
            sentence.lower().split())))  # nltk + spacy: first merge phrases with nltk, then tokenize with spaCy
Example #28
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()

    #add tuples of words into multiword tokenizer
    for word in relevant_words:
        token = str(word).split()
        move_data=[]
        for element in token:
            move_data.append(element)
        tup = tuple(move_data)
        mwetokenizer.add_mwe(tup)

    #execute multitokenization
    return mwetokenizer.tokenize(text)
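A quick usage sketch, assuming the function above is in scope:

print(multi_word_tokenizer(['New York'], ['I', 'love', 'New', 'York', 'pizza']))
# ['I', 'love', 'New_York', 'pizza']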
Example #29
    def tokenize_sentence(self, string, max_sentence_len, with_labels=False):
        merger = MWETokenizer([('<', 'unk', '>')], separator = '')
        sentence = word_tokenize(string.strip())       # tokenize sentence
        sentence = merger.tokenize(sentence)         # merge <unk>
        if with_labels:
            sentence = sentence[1:]
        sentence = [token.lower() for token in sentence]
        sentence = sentence[:max_sentence_len - 2]   # cut sentence at max_sentence_length
        sentence = ['<sos>'] + sentence + ['<eos>']  # add start and end-of-sentence tags

        # pad the rest of the sentence
        padded_sentence = sentence.copy()
        padded_sentence.extend(['<pad>']*(max_sentence_len - len(sentence)))

        return sentence, padded_sentence
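The merger above re-joins '<unk>' placeholders that word_tokenize splits into '<', 'unk', '>'. A minimal standalone sketch of just that step (token list pre-split by hand):

from nltk.tokenize import MWETokenizer

merger = MWETokenizer([('<', 'unk', '>')], separator='')
print(merger.tokenize(['the', '<', 'unk', '>', 'sat', 'on', 'the', 'mat']))
# ['the', '<unk>', 'sat', 'on', 'the', 'mat']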
Example #30
def create_tokenizer():
    with open('mwe-prep-ru-final.txt', 'r') as f:
        lines = f.read().split('\n')
    mwe_list = [tuple(line.split(' ')) for line in lines if 'lemma' not in line and line != '']
    
    with open('mwes-prep-en.html', 'r') as f:
        lines = f.read().split('\n')   
    mwe_list_en = []

    for line in lines:
        if '</b>:' in line:
            mwe_list_en.append(tuple(line.split('</b>: ')[1].split(' <td align=right>')[0].split(' ')))
    
    mwe_list.extend(mwe_list_en)
    return MWETokenizer(mwe_list)