Example #1
def clean_text(dirty_text, external_vocab_filename=None, external_vocab_level="no"):
    """(str, str, str) -> str

    external_vocab_level can be: no, with_ngrams, only.
    if you choose with_ngrams or only, you need to add an external_vocab_filename

    The text is cleaned in the following way:
    # substitute word1.word2 by word1. word2
    # split text into rough sentences based on '<\\%>'.  This symbol was added to denote
      a new line on the original product description
    # split the rough sentences using a sentence tokenizer from nltk
    # if external_vocab_level is 'with_ngrams' or 'only', combine the external-vocabulary ngrams into one token, e.g., short sleeves -> short_sleeves

    # concatenate all tokenized words into one string and return string.

    An excerpt of text from the item looks like this:
    Sheath/Column One-Shoulder Short/Mini Bridesmaid Dress With Lace<\\%>SKU#:1020234<\\%>New Style Cocktail Dress<\\%>
    Color:The color as picture<\\%>Category:Bridesmaid Dress<\\%> Brand:Biggoldapple<\\%>
    Silhouette:Sheath/Column<\\%> Neckline:One-Shoulder<\\%> Hemline/Train:Short/Mini<\\%>

    """

    external_vocab_words = []
    if external_vocab_filename is not None:
        external_vocab_words = load_vocab_words(external_vocab_filename)

        # transform ngrams into tuples
        external_vocab_with_tuples = [tuple(z.split('_')) for z in external_vocab_words]  # assume that ngrams are separated by underscore: word1_word2.
        # multiple word tokenizer, more info: http://www.nltk.org/api/nltk.tokenize.html
        tokenizer_mwe = MWETokenizer(external_vocab_with_tuples)  # all external_vocab_words are added

    out_clean_text = ''
    # substitute word1.word2 by word1. word2
    dirty_text = re.sub(r"(\w[A-Z]|[a-z.])\.([^.)\s])", r"\1. \2", dirty_text)
    rough_sentences = dirty_text.split('<\\%>')  # sentences based on splitting by '<\\%>'

    sentences = []
    for rs in rough_sentences:
        rs = rs.replace("3/4", "3_4")  # just to keep the 3/4 as 3_4
        sentences.extend(SENT_TOKENIZER.tokenize(rs))  # sentences based on NLTK tokenizer

    for sentence in sentences:
        words = WORD_TOKENIZER.tokenize(sentence.lower())  # tokenize based on words. ignore that zappos vocabulary exists

        if external_vocab_level == 'with_ngrams':
            # keep all words (even those not in zappos), but group zappos ngrams into one token
            words = tokenizer_mwe.tokenize(words)  # group zappos_ngrams into one token.
        elif external_vocab_level == 'only':
            words = tokenizer_mwe.tokenize(words)  # group zappos_ngrams into one token.
            words = [w for w in words if w in external_vocab_words]  # only keep words in the zappos vocabulary

        words = [w for w in words if (not w.isdigit() or w == '3_4')]  # remove words that are just digits, but leave 3_4

        words_concat = ' '.join(words) + '\n'
        out_clean_text += words_concat

    return out_clean_text
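
Usage note: a minimal, self-contained sketch of the ngram-merging step used above, assuming a small made-up vocabulary whose multi-word entries are joined by underscores (the real load_vocab_words, SENT_TOKENIZER and WORD_TOKENIZER helpers are not reproduced here).

from nltk.tokenize import MWETokenizer

external_vocab_words = ['short_sleeves', 'one_shoulder']  # assumed entries, not the real vocabulary file
external_vocab_with_tuples = [tuple(z.split('_')) for z in external_vocab_words]
tokenizer_mwe = MWETokenizer(external_vocab_with_tuples)  # default separator is '_'

words = 'sheath dress with one shoulder and short sleeves'.split()
print(tokenizer_mwe.tokenize(words))
# ['sheath', 'dress', 'with', 'one_shoulder', 'and', 'short_sleeves']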
Example #2
    def fit(self, X, **fit_params):
        """
        Procedure to iteratively contract bigrams (up to max_collocation_iterations times)
        that score higher on the collocation_function than the min_collocation_score (and satisfy other
        criteria set out by the optional parameters).
        """
        self.tokenization_ = X
        n_tokens = sum([len(x) for x in X])
        for i in range(self.max_iterations):
            bigramer = BigramCollocationFinder.from_documents(
                self.tokenization_)

            if self.ignored_tokens is not None:
                ignore_fn = lambda w: w in self.ignored_tokens
                bigramer.apply_word_filter(ignore_fn)

            if self.excluded_token_regex is not None:
                exclude_fn = (lambda w: re.fullmatch(self.excluded_token_regex,
                                                     w) is not None)
                bigramer.apply_word_filter(exclude_fn)

            if self.min_token_occurrences is not None:
                minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
                bigramer.apply_word_filter(minocc_fn)

            if self.max_token_occurrences is not None:
                maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
                bigramer.apply_word_filter(maxocc_fn)

            if self.min_token_frequency is not None:
                minfreq_fn = (lambda w: bigramer.word_fd[w] <
                              self.min_token_frequency * n_tokens)
                bigramer.apply_word_filter(minfreq_fn)

            if self.max_token_frequency is not None:
                maxfreq_fn = (lambda w: bigramer.word_fd[w] >
                              self.max_token_frequency * n_tokens)
                bigramer.apply_word_filter(maxfreq_fn)

            if self.min_ngram_occurrences is not None:
                bigramer.apply_freq_filter(self.min_ngram_occurrences)

            new_grams = list(
                bigramer.above_score(self.score_function, self.min_score))

            if len(new_grams) == 0:
                break

            self.mtes_.append(new_grams)

            contracter = MWETokenizer(new_grams)
            self.tokenization_ = tuple([
                tuple(contracter.tokenize(doc)) for doc in self.tokenization_
            ])

        return self
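
For context, a stripped-down sketch of one contraction pass from fit, using plain NLTK; the likelihood-ratio scorer and the 2.0 threshold are placeholder choices, not the class defaults.

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import MWETokenizer

docs = [['new', 'york', 'is', 'in', 'new', 'york', 'state'],
        ['i', 'love', 'new', 'york']]

bigramer = BigramCollocationFinder.from_documents(docs)
new_grams = list(bigramer.above_score(BigramAssocMeasures.likelihood_ratio, 2.0))  # assumed threshold

contracter = MWETokenizer(new_grams)  # each surviving bigram becomes a single 'w1_w2' token
docs = [tuple(contracter.tokenize(list(doc))) for doc in docs]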
Example #3
def tokenize_and_remove_punct(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    mtokenizer = MWETokenizer()
    mwe = mtokenizer.tokenize(text.split())
    words = []
    for t in mwe:
        if t.isalpha():
            words.append(t)
    return words
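
A quick usage sketch, assuming the function above and its imports (string, MWETokenizer) are in scope; with no MWEs registered, the tokenizer just passes the whitespace-split tokens through.

print(tokenize_and_remove_punct("Hello, world! It's 2020."))
# expected: ['Hello', 'world', 'Its']  -- punctuation stripped, the purely numeric token dropped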
Example #4
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params

    idf_list = [*idf]
    idf_set = set(idf_list)

    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    phrases_score = {}
    for phrase in tqdm(list_phrases,
                       desc='phrase-eval-{}'.format(pid),
                       mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))

        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)

        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = target_token_freq[token] / len(
                target_token) * idf[token]

        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = phrase_token_freq[token] / len(
                tokens) * idf[token]

        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)

        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})

    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)

    return {key: phrases_score[key] for key in top_10}
Example #5
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
        pass

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # For each sentence in the sentences

                # Tokenize the sentence based on Regex and then using MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))

                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]

                # Lemmatize the sentence. Find the POS tags and then lemmatize
                tokens_lowecase_tagged = nltk.pos_tag(token_lowercase)
                lammetized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag)) for wrd, tag in tokens_lowecase_tagged]

                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lammetized_sentence]

                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]

                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
Example #6
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(0, len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        tokenizer.add_mwe(aspect_split)  # add_mwe mutates the tokenizer in place and returns None
        aspect_tokenized.append(aspect_split)
    for j in range(0, len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
    return aspect_tokenized, sentence_tokenized
Example #7
def multiword_tokenize(text, mwe):
    # The tokenizer splits contractions ("Don't" => 'Do', "n't"); sentence-delimiting "," and "." become separate tokens
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(word_tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
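
A rough usage sketch of multiword_tokenize above, assuming word_tokenize and MWETokenizer are imported and the NLTK 'punkt' data is installed; the protected expressions are merged during tokenization and then restored to their original spelling.

mwe = ['New York', 'ice cream']  # assumed protected expressions
print(multiword_tokenize('I ate ice cream in New York.', mwe))
# expected: ['I', 'ate', 'ice cream', 'in', 'New York', '.']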
Example #8
    def sentence_filter(self, sentence):  # preliminary word segmentation and cleaning of the sentence

        if self.language == 'chinese':

            import jieba.posseg as psg

            return psg.cut(sentence)  # use jieba's POS-tagging interface to do segmentation and cleaning in one step

        elif self.language == 'english':

            from nltk.tokenize import MWETokenizer  # use the MWE tokenizer

            tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with an underscore '_'

            nlp = spacy.load('en_core_web_sm')  # create the spaCy tokenizer

            # for word in self.userdict:    # add custom words to spaCy; this apparently has no effect
            #     lex = nlp.vocab[word]

            # clean up punctuation
            quote_double_pattern = re.compile('“|”')
            quote_single_pattern = re.compile('‘|’')
            punc_pattern = re.compile(
                "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,")

            sentence = re.sub(quote_double_pattern, '"', sentence)
            sentence = re.sub(quote_single_pattern, "'", sentence)  # keep 's and s' cases, so these cannot simply be deleted
            sentence = re.sub(punc_pattern, ' ', sentence)

            # nltk and spacy produce token lists; for pke, the full sentence is returned instead
            # return nlp(' '.join(sentence.split()))    # spacy
            return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))  # nltk + spacy: first merge phrases with nltk, then tokenize with spacy
            # return sentence    # pke

        elif self.language == 'japanese':

            mecab = MeCab.Tagger('')  # use the mecab tokenizer directly; a custom dictionary cannot be added yet, so some proper nouns are not recognized (e.g. 比特/币, "bit/coin")

            # clean up punctuation
            punc_pattern = re.compile(
                "\xa0|\t|\n|\:|\;| — | - |\!|\@|\#|\$|\%|\^|\&|\*|\_|\?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|/|・|「|」|•|※")
            sentence = re.sub(punc_pattern, ' ', sentence)

            sentence = [
                (
                    chunk.split('\t')[0],
                    chunk.split('\t')[1].split(',')[0]
                )
                for chunk in mecab.parse(sentence).splitlines()[:-1]
            ]  # get the root form and part of speech from each parsed entry

            return sentence
Example #9
def text_process_group(mess):
    """
    1. Lower case the input
    2. Remove punctuation except '-'
    3. Apply custom tokenizer
    4. Return column of clean text words"""
    mess = mess.lower()
    # note: the \P{P} Unicode property class requires the third-party 'regex' module, not the stdlib re
    regex = r"[^\P{P}-]+"
    new_mess = re.sub(regex, " ", mess, 0)
    tokenizer = MWETokenizer(all_list, separator=' ')
    token = tokenizer.tokenize(new_mess.lower().split())
    sw = [x for x in token if x not in stopwords.words('english')]
    return sw
Example #10
class AllKeywordsMatcher:
    def __init__(self, keywords: Set[str]):
        keywords_tuples = [tuple(k.split()) for k in keywords]
        self.keywords = keywords
        self._mwe_tokenizer = MWETokenizer(keywords_tuples, separator=" ")
        self._punc_regex = re.compile(r"[^\w\s]")

    def all_occurring_keywords(self, text: str) -> List[str]:
        text_without_punc = self._punc_regex.sub("", text)
        queried_text = self._mwe_tokenizer.tokenize(text_without_punc.split())
        found_words = [word for word in queried_text if word in self.keywords]

        return found_words
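
Roughly how the matcher above behaves, assuming the class and its imports (re, typing, nltk) are in scope; note that matching is case-sensitive, so keywords should be supplied in the same case as the text.

matcher = AllKeywordsMatcher({'python', 'machine learning'})  # assumed keyword set
print(matcher.all_occurring_keywords('We use Python for machine learning, mostly.'))
# expected: ['machine learning']  -- 'Python' is not matched because of the capital P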
Example #11
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()

    #add tuples of words into multiword tokenizer
    for word in relevant_words:
        token = str(word).split()
        move_data=[]
        for element in token:
            move_data.append(element)
        tup = tuple(move_data)
        mwetokenizer.add_mwe(tup)

    #execute multitokenization
    return mwetokenizer.tokenize(text)
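
A small usage sketch, assuming the function above and MWETokenizer are in scope; note that text must already be a list of tokens, since the function calls mwetokenizer.tokenize(text) directly.

relevant_words = ['New York', 'machine learning']  # assumed relevant words
print(multi_word_tokenizer(relevant_words, 'I study machine learning in New York'.split()))
# expected: ['I', 'study', 'machine_learning', 'in', 'New_York']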
Example #12
 def sentence_filter(self, sentence):
     tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with an underscore '_'
     nlp = spacy.load('en_core_web_sm')  # create the spaCy tokenizer
     quote_double_pattern = re.compile('“|”')
     quote_single_pattern = re.compile('‘|’')
     punc_pattern = re.compile(
         "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,"
     )
     sentence = re.sub(quote_double_pattern, '"', sentence)
     sentence = re.sub(quote_single_pattern, "'",
                       sentence)  # keep 's and s' cases, so these cannot simply be deleted
     sentence = re.sub(punc_pattern, ' ', sentence)
     return nlp(' '.join(tokenizer.tokenize(
         sentence.lower().split())))  # nltk + spacy: first merge phrases with nltk, then tokenize with spacy
Example #13
def search_term(term):
    #i = 0
    # NOTE: 'i' below must be defined in an enclosing scope (presumably the spreadsheet row index for this term)
    for tweet in tweets:
        #if i == 10:
        #	break
        #print(tweet['text'])
        tt = tweet['text'].lower()
        tm = term.lower()
        if ' ' in term:  # MultiToken
            tokenize_term = word_tokenize(tm)
            tokenizer = MWETokenizer('', separator=" ")
            before_tokens = len(tokenizer.tokenize(tt.split()))
            tokenizer = MWETokenizer([tokenize_term], separator=" ")
            after_tokens = len(tokenizer.tokenize(tt.split()))
            if after_tokens < before_tokens:
                sheet['H' + str(i + 1)] = sheet['H' + str(i + 1)].value + 1
                #book.save('tfreq-gl-filter.xlsx')
                #exit()
                #create_frequency_matrix(tm)
                #print('#MultiToken:', tm)
                #print(tokenizer.tokenize(tt.split()))
        else:  # Token
            if tm in word_tokenize(tt):
                sheet['H' + str(i + 1)] = sheet['H' + str(i + 1)].value + 1
Example #14
    def tokenize_sentence(self, string, max_sentence_len, with_labels=False):
        merger = MWETokenizer([('<', 'unk', '>')], separator = '')
        sentence = word_tokenize(string.strip())       # tokenize sentence
        sentence = merger.tokenize(sentence)         # merge <unk>
        if with_labels:
            sentence = sentence[1:]
        sentence = [token.lower() for token in sentence]
        sentence = sentence[:max_sentence_len - 2]   # cut sentence at max_sentence_length
        sentence = ['<sos>'] + sentence + ['<eos>']  # add start and end-of-sentence tags

        # pad the rest of the sentence
        padded_sentence = sentence.copy()
        padded_sentence.extend(['<pad>']*(max_sentence_len - len(sentence)))

        return sentence, padded_sentence
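
The key trick above is merging the '<', 'unk', '>' pieces that word_tokenize produces back into a single '<unk>' token; a minimal sketch (requires the NLTK 'punkt' data).

from nltk.tokenize import MWETokenizer, word_tokenize

merger = MWETokenizer([('<', 'unk', '>')], separator='')
print(merger.tokenize(word_tokenize('the <unk> sat on the mat')))
# expected: ['the', '<unk>', 'sat', 'on', 'the', 'mat']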
Example #15
def multiword_tokenize(text, mwe):
    # Initialize the MWETokenizer
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text ('tokenize' appears to be a custom tokenizer defined elsewhere; see the commented-out French word_tokenize variant below).
    #tokenized_text = tokenizer.tokenize(word_tokenize(text,language='French'))
    #print(tokenize(text))
    tokenized_text = tokenizer.tokenize(tokenize(text))

    #print(tokenized_text)
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
Example #16
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()
    f.close()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))

    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir,fname), 'r') as f:
            doc = f.readlines()
        f.close()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                doc = nlp(item_dict['text'])
                item_dict.update({'entityMentioned':mentioned_entity})
                unigram = [token.text for token in textacy.extract.ngrams(doc,n=1,filter_nums=True, filter_punct=True, filter_stops=True)]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in doc]
                pos = [token.pos_ for token in doc]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
        f.close()
Example #17
def k_tokenizer(text):
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')  # decode back to str so .replace works on Python 3
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]
    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([
        ('ios', '9'),
    ])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)
    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """
    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(
            token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
Example #18
def k_tokenizer(text):
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')  # decode back to str so .replace works on Python 3
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([('ios', '9'),])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """

    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc=[]
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
Example #19
    def processText(self):
        stop_words = set(stopwords.words('english'))
        # tokenizer = nltk.RegexpTokenizer(r"\w+")

        tokenizer = MWETokenizer([('web', 'framework'), ('file', 'system'),
                                  ('command', 'line')])  # for multi-word phrases
        text_process = TextToWord.removeUseless(self, self.text)
        # text_token = TextToWord.lower(self,tokenizer.tokenize(text_process)) # convert to tokens; shouldn't lower-case first, because POS tagging uses capitalization to detect proper nouns
        text_token = tokenizer.tokenize(word_tokenize(text_process))
        text_token = TextToWord.lower(self, text_token)
        filtered_sentence = [w for w in text_token if not w in stop_words]
        # print(filtered_sentence)

        tagged_sent = pos_tag(filtered_sentence)  # get each word's part of speech
        wnl = WordNetLemmatizer()
        text_processed = []
        custom_dictionary = ['fs']
        for tag in tagged_sent:
            if tag[0] in custom_dictionary:
                wordnet_pos = 'NNP'
            else:
                wordnet_pos = TextToWord.get_wordnet_pos(
                    self, tag[1]) or wordnet.NOUN
            if wordnet_pos == 'NNP':  # proper nouns are not lemmatized
                text_processed.append(tag[0])
            else:
                text_processed.append(wnl.lemmatize(tag[0],
                                                    pos=wordnet_pos))  # lemmatize

        with open(
                'C:/Users/Admin/Documents/我的坚果云/NPM_Cate/material/delete-words',
                encoding="utf-8") as files:
            delete_words_list = files.read()
        delete_words = delete_words_list.split()

        # with open('C:/Users/Admin/Documents/我的坚果云/NPM_Cate/material/uninformative-words.json', 'r') as json_file:
        #     uninformative_words = json.load(json_file)
        #
        # delete_words = set(delete_words + uninformative_words)

        text_processed = [w for w in text_processed if not w in delete_words]

        return text_processed  # lower-cased at the end
Example #20
def main():

    # Testing purposes

    test_sentence1 = "merhaba, [email protected] <html>!! www.abc.com #hello selam# nasılsınız:  Milli Eğitim Bakanlığı 2.01.1997'de 20:02'de aradı"
    test_sentence2 = "www.assignment.com.tr adresine gir. [email protected] a Dr. hanıma mail at."
    test_sentence3 = "bizi new jersey bekler"

    # Multiword expressions
    # Test for including multiword expressions:
    mwe = MWETokenizer([('Milli', 'Eğitim', 'Bakanlığı'),
                        ('Bilkent', 'Üniversitesi')],
                       separator='_')

    tokenizer = RuleBasedTokenizer()

    list_of_tokens = tokenizer.tokenize(test_sentence2)
    mwe_list_of_tokens = mwe.tokenize(list_of_tokens)

    print(list_of_tokens)
Example #21
def vectorize(patient):

    tokenizer = MWETokenizer([("bleeding", "gum"), ("chest", "pain"),
                              ("abdominal", "pain"), ("muscle", "pain"),
                              ("joint", "pain"), ("eye", "pain"),
                              ("nerve", "pain"), ("ligament", "pain"),
                              ("tendon", "pain"), ("bleeding", "nose")])

    dict = [
        "headach", "vomit", "nausea", "bleeding_gum", "itch", "rash", "fever",
        "diarrhea", "discomfort", "chest_pain", "abdominal_pain", "fatigu",
        "muscle_pain", "chill", "eye_pain", "joint_pain", "nerve_pain",
        "ligament_pain", "tendon_pain", "bleeding_nos"
    ]
    dict2 = [
        "headache", "vomit", "nausea", "bleeding_gum", "itch", "rash", "fever",
        "diarrhea", "discomfort", "chest_pain", "abdominal_pain", "fatigue",
        "muscle_pain", "chill", "eye_pain", "joint_pain", "nerve_pain",
        "ligament_pain", "tendon_pain", "bleeding_nose"
    ]
    synonyms_dict = [get_synonyms(dict2[x]) for x in range(len(dict2))]
    tokens = tokenizer.tokenize(word_tokenize(patient))
    ps = PorterStemmer()
    modified_tokens = [ps.stem(word) for word in tokens]
    #print(modified_tokens)

    token_set = []
    arra = [0 for x in range(len(dict))]

    for word in modified_tokens:
        for x in range(len(dict)):
            # mark the symptom if the stemmed token matches it directly or via one of its synonyms
            if word == dict[x] or word in synonyms_dict[x]:
                token_set.append(word)
                arra[x] = 1
    return arra
Example #22
def trim_bio(text):

    # keywords to return
    keywords = []

    ## define important words
    #important_words = [ ["data", "science"],
    #                    ["data", "scientist"],
    #                    ["machine", "learning"],
    #                    ["data", "engineer"],
    #                    ["data", "analytics"],
    #                    ["artificial", "intelligence"],
    #                    ["ai"], ["phd"], ["founder"], ["professor"],["candidate"],["ceo"],
    #                    ["student"], ["engineer"], ["computer", "science"]
    #                    ]

    # load from file after custom edit
    df_keyword = pd.read_csv("data/keywords/df.csv")

    ## convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()

    ## format important words so that they can be registered to tokenizer
    important_words = [x.split() for x in important_words]

    # initialize tokenizer
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe([x for x in iw])  # add important words
        #tokenizer.add_mwe(iw)  # add important words

    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])

    # find important words from tokens, append it to keyword
    for iw in important_words:
        iw_joined = "_".join(iw)
        if (iw_joined in tokens):
            keywords.append(iw_joined)

    return keywords
Example #23
def suggest_commplete(inSentence):
    suggestion_sentences = []
    
    tokenizer = MWETokenizer([('hors', "d'oeuvre"), ('program', 'me')], separator='+') # to define the words separated by spaces.
    personal_dictionary = os.path.abspath(os.path.join(CUR_DIR, 'resources', 'sg_words.txt'))
    d = enchant.DictWithPWL("en_US", personal_dictionary)
    
    new_sentence = inSentence
    meaning_words = tokenizer.tokenize(inSentence.split())
    
    for word in meaning_words:
        if (word in [".", "?", ","]):
            continue
        word = re.sub(r"[(),!?\'\`@-]", "", word)
        if (not d.check(word)):
            new_words = d.suggest(word)
            for new_word in new_words:
                new_sentence = inSentence.replace(word, new_word)
                suggestion_sentences.append(new_sentence)
    
    return suggestion_sentences
Example #24
def Tokenize(text):
    tokenizer = MWETokenizer(category.all())
    for word in category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in sub_category:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in brand:
        if ' ' in word:
            tokenizer.add_mwe(word.split())
    for word in article:
        if ' ' in word:
            tokenizer.add_mwe(word.split())

    token = tokenizer.tokenize(text.split())
    tokens = []
    for word in token:
        word = word.replace("_", " ")
        tokens.append(word)
    return tokens
Example #25
def text_process(text):
    #number removal
    if text == -2:
        return ''

    body = re.sub(r'\d+', '', text)

    #punctuation removal i.e. [!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]
    #     punc = string.punctuation
    #     punct_mapping = {"_":" ", "'":" "}
    #     punc += "“”’"
    punc = "/-'?!,#$%\'()*+-/:;<=>@\\^_`{|}~[]" + '""“”’'

    #     punc = re.sub("-","", punc)
    body = body.translate(body.maketrans(punc, " " * len(punc)))

    #text lower
    body = body.lower()

    #multi-word tokenize
    multi_word_list = [('north', 'korea'), ('south', 'korea'),
                       ('north', 'korean'), ('south', 'korean'),
                       ('kim', 'jong', 'un'), ('park', 'geun', 'hye')]
    tokenizer = MWETokenizer()
    for mw in multi_word_list:
        tokenizer.add_mwe(mw)
    text = tokenizer.tokenize(body.split())

    #stopword removal
    stopset = set(stopwords.words('english'))
    #     text = word_tokenize(body)
    text = [x for x in text if x not in stopset]
    text = [word for word in text if len(word) > 3]

    #lemmatization
    lemmatizer = WordNetLemmatizer()
    lemma_text = [lemmatizer.lemmatize(x) for x in text]

    return lemma_text
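
A minimal sketch of just the multi-word step above, using a subset of the list (the stopword and lemmatization steps additionally need the NLTK 'stopwords' and 'wordnet' data).

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer()
for mw in [('north', 'korea'), ('kim', 'jong', 'un')]:
    tokenizer.add_mwe(mw)
print(tokenizer.tokenize('kim jong un visited north korea'.split()))
# expected: ['kim_jong_un', 'visited', 'north_korea']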
Example #26
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()
    f.close()

    entityset = set(raw_list.split('\n'))

    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir,fname), 'r') as f:
            doc = f.readlines()
        f.close()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            if set(item_dict['nsubj']).issubset(pronoun) or item_dict['nsubj'] == []:
                continue
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                item_dict.update({'entityMentioned':mentioned_entity})
                item_dict['iid'] = '{}{}{}'.format(item_dict['did'],item_dict['pid'],item_dict['sid'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
        f.close()
Example #27
 def remove_words_tuples_corpus(self,doclist,updated_ngramlist,filter_words):
     
     """ Removes those words and ngrams(tuples) from given filter word list(which contains words and ngram tuples to be removed) 
     Parameters
     ----------
     doclist: list
         list of word tokenized document list which may or may not be mwetokenized with ngrams
     updated_ngramlist: list
         list which contains ngrams which have been updated by having ngram to be filtered removed using remove_words_from_ngramlist method
     filter_words: list
         list which contains words and ngrams(tuples) to be filtered from document list
         
     Returns
     -------
     doclist1: list
         list of filtered documents
     """          
     
     # accepts a doclist which hasn't been updated with the mwetokenizer yet;
     # but if it has already been mwetokenized and ngrams joined with '_' were introduced, then
     
     # convert ngram concatenated by _ to tuples first
     doclist0 = [[tuple(token.split('_')) if '_' in token else token for token in doc] for doc in doclist]
    
     #removing required tokens  from bigram list
     print('total length of doclist = ',len(doclist0))
     print('total words in filter list',len(filter_words))
     start_time = time.time()
     filter_words = set(filter_words)        
     doclist1 = [[z for z in doc if z not in filter_words] for doc in doclist0]  
     print("--- %s seconds for removal ---" % (time.time() - start_time))
 
     #combining the filtered bigram list and doclist using mwetokenizer 
     start_time = time.time()
     mwe_tokenizer = MWETokenizer(updated_ngramlist)
     doclist1 = [mwe_tokenizer.tokenize(doc) for doc in doclist1]
     print("--- %s seconds for MWE ---" % (time.time() - start_time))
     return doclist1
Example #28
def train_phrase_model_to_xlsx(corpus_file, out_file, phrases):
    raw = ' '.join([
        line.strip('\n').lower()
        for line in open(corpus_file, 'r', encoding='UTF-8').readlines()
    ])
    raw = re.sub(r"[{}]+".format(punctuation), '', raw)
    word_tokens = word_tokenize(raw)

    tokenizer = MWETokenizer([word_tokenize(phrase) for phrase in phrases],
                             separator='_')
    word_tokens = tokenizer.tokenize(word_tokens)

    stop_words = set(stopwords.words('english'))
    word_tokens = [token for token in word_tokens if token not in stop_words]

    model = word2vec.Word2Vec([word_tokens],
                              size=100,
                              window=5,
                              min_count=1,
                              workers=5)
    df = pd.DataFrame([model.wv.get_vector(word) for word in model.wv.vocab],
                      index=model.wv.vocab)
    df.to_excel(out_file)
Example #29
def get_instances(text: str, idx: int = -1) -> List[Instance]:
    """
    Return all candidate instances from the given marked text.
    A candidate instance must either be directly marked or (contain only titled-case words and have <= 3 words)

    :param text: marked text, each entity is marked by <p>...</p>
    :param idx: file index
    :return: a list of instances
    """
    instances = []

    tokenizer = MWETokenizer([('<', 'p', '>'), ('<', '/p', '>')], separator='')

    for sent in sent_tokenize(text):
        tok_w_label = tokenizer.tokenize(word_tokenize(sent))
        tok, label = extract_label(tok_w_label)
        pos = [get_simple_pos(t[1]) for t in pos_tag(tok)]

        assert len(pos) == len(tok)

        instances += _get_instances(tok, pos, label, idx)

    return instances
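
For illustration, how the '<p>'/'</p>' markers get reassembled, since word_tokenize splits them into '<', 'p'/'/p', '>' (requires the NLTK 'punkt' data); the sentence is a made-up example.

from nltk.tokenize import MWETokenizer, word_tokenize

marker_tokenizer = MWETokenizer([('<', 'p', '>'), ('<', '/p', '>')], separator='')
print(marker_tokenizer.tokenize(word_tokenize('The <p>Eiffel Tower</p> is in Paris.')))
# expected: ['The', '<p>', 'Eiffel', 'Tower', '</p>', 'is', 'in', 'Paris', '.']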
Example #30
 def get_context(self, query_str, text, k=10):
     if query_str in text:
         tokenizer = MWETokenizer()
         query_str_tokens = tuple(query_str.split())
         query_str_dashed = "_".join(query_str_tokens)
         tokenizer.add_mwe(query_str_tokens)
         text_token = tokenizer.tokenize(text.split())
         try:
             t_start = text_token.index(query_str_dashed)
         except:
             return None, None, None
         t_end = t_start + 1
         start_index = max(t_start - k, 0)
         end_index = min(t_end + k, len(text_token))
         text_token_query = text_token[start_index:t_start] + text_token[
             t_end + 1:end_index]
         context = " ".join(text_token_query)
         context_mention = text_token[start_index:t_start] + [
             query_str
         ] + text_token[t_end + 1:end_index]
         context_mention = " ".join(context_mention)
         return context, text_token_query, context_mention
     else:
         return None, None, None
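
The core idea of get_context above: collapse the multi-word query into a single token so a fixed-size window can be sliced around one index. A reduced sketch with assumed inputs.

from nltk.tokenize import MWETokenizer

tokenizer = MWETokenizer()
tokenizer.add_mwe(('deep', 'learning'))  # assumed query string
tokens = tokenizer.tokenize('recent research on deep learning methods and tools'.split())
idx = tokens.index('deep_learning')
print(tokens[max(idx - 2, 0):idx], tokens[idx + 1:idx + 3])
# expected: ['research', 'on'] ['methods', 'and']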
Example #31
flags = reduce(lambda ls, d: ls + d["flags"], states, [])
tokenizer = MWETokenizer(flags)
# transcript = transcript.splitlines() # need for text block
# print(flags)
# print(transcript)

## Add flags in the token representation for each state
for state in states:
    state["token_flags"] = list(map(lambda phr: '_'.join(phr), state["flags"]))

for trans in transcript:
    line = trans["transcript"]
    line_no_punct = line.translate(str.maketrans('', '', PUNCT)).lower()

    tokens = tokenizer.tokenize(line_no_punct.split())

    ls = list(
        map(
            lambda tok: reduce(
                lambda p_res, s: (tok, s["state"]) if
                (tok in s["token_flags"]) else p_res, states, None), tokens))

    states_found = list(filter(None, ls))

    if (states_found != []):
        timing = (trans["word_timings"][0]["start_time"],
                  trans["word_timings"][-1]["end_time"])
        print('\n', line, '\n', states_found, timing)

    # print(line)
Example #32
    #basic search: searches for food words within tw_sentence
    #match = re.search(r"[Cc]loud.?\W+(?:\w+\W+){0,4}?[Tt]aste.?(?:\w+\W+){0,4}?[Ll]ike[^.](?:\w+\W+)[^.]*", tw_sentence)

    # basic2 search: searches for food words within tw_sentence in two directions
    match = re.search(r"[^\.\!\?\n]*(?:[Tt]aste.?\W+(?:\w+\W+){0,2}?[Ll]ike[\s][Cc]loud.?|[Cc]loud.?\W+(?:\w+\W+){0,2}?[Tt]aste.?\W+(?:\w+\W+){0,2}?[Ll]ike.?\W+(?:\w+\W+))[^\.\!\?\n]*", tw_sentence) #only output sentences that have the phrase clouds taste like <food from database>

    #exception handling
    try:
        phrase = match.group()
    except:
        phrase = None

    if phrase:
        phrase = phrase.lower()
        ph_tokens = word_tokenize(phrase)
        mwe_tokens = mwe_tokenizer.tokenize((phrase).split())

        for mwe_token in mwe_tokens:
            for list_join_wd_permutation in list_join_wd_permutations:
                if mwe_token == list_join_wd_permutation:
                    print(mwe_tokens)

                    if not list_join_wd_permutation in counter:
                        print('Adding new food to dictionary...')

                        counter[list_join_wd_permutation] = 1
                    else:
                        print('Incrementing existing food in dictionary...')
                        counter[list_join_wd_permutation] += 1
                    print("Dictionary is: ", counter)
                    print('---')
Example #33
    def extract_expressions(self, document, features=None):
        """Returns expressions from given features and multi-word expressions.
        
        In addition to passing a document into this method, MWEs or Multi-Word Expressions
        can be given to treat some multi words as one expression.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL" :
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal to width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract data fields from.
            features(list): List of features containing data fields to extract. If not given, use
                            all document features.
            mwes(list): List of Multi-Word Expressions. Example value:
                        `['property type', 'single family']`. With that list, both "property type"
                        and "single family" will each be treated as single expressions.        
        Returns:
            np.array: An array of data_fields.
        """
        mwes = self.mwes
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        for idx, mwe in enumerate(mwes):
            if isinstance(mwe, str):
                mwes[idx] = word_tokenize(mwe.lower())
            elif hasattr(mwe, '__iter__'):
                mwes[idx] = [x.lower() for x in mwe]
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

        expressions = []
        pos = 0
        for token in tokenized:
            # token could be "deez nutz" but text contains multiple spaces e.g. "deez  nutz",
            # so we need to split the token and find position of first and last characters.
            words = token.split()
            start_pos = text.lower().index(words[0], pos)
            for word in words:
                ipos = text.lower().index(word, pos)
                end_pos = ipos + len(word)
            pos = end_pos
            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            if len(features[start_pos:end_pos,:]) > 0:
                min_x =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x')]
                max_x =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x1')]
                min_y =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y')]
                max_y =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y1')]
                page = features[start_pos, ArthurDocument.get_feature_id('page')]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions
Example #34
		tool.append((0, 1))
	abstracts.append(data[i]['abstract'] + ' ' + data[i]['title'])

	abstracts[i] = re.sub(r'https?:\/\/github\.com\S*', 'githuburl', abstracts[i])
	abstracts[i] = re.sub(r'https?:\/\/bioconductor\.org\S*', 'bioconductorurl', abstracts[i])
	abstracts[i] = re.sub(r'https?:\/\/sourceforge\.net\S*', 'sourceforgeurl', abstracts[i])
	abstracts[i] = re.sub(r'https?:\/\/bitbucket\.org\S*', 'bitbucketurl', abstracts[i])
	# remove all other urls
	abstracts[i] = re.sub(r'https?:\/\/\S*', 'url', abstracts[i])
	abstracts[i] = re.sub(r'www\.\S*', 'url', abstracts[i])
	# remove email addresses
	abstracts[i] = re.sub(r'\S*@\S*', ' ', abstracts[i])
	# change slashes to spaces
	abstracts[i] = re.sub(r'\/', ' ', abstracts[i])
	abstracts[i] = word_tokenize(abstracts[i])
	abstracts[i] = tokenizer.tokenize(abstracts[i])

# filter stopwords and punctuation
stopwords = nltk.corpus.stopwords.words('english')

# lemmatize words
#es = EnglishStemmer()



# dictionary created to be used later when preprocessing testing data
dictionary = set()
fdist = FreqDist()
#if not args.train:
with open('dict.csv', 'r', newline='') as csvfile:
	reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
Example #35
    ps = PorterStemmer()
    print("Stemming:")
    for word in filter(lambda a: " " not in a, words):
        print("{} => {} / {} / {}".format(word, lmtzr.lemmatize(word), st.stem(word), ps.stem(word)))
        sentiment_bag.add(word)
        sentiment_bag.add(st.stem(word))  # I like this one the best

    # Process all the lists
    for (label, files) in sorted(makecloud.TRANSCRIPTS.items()):
        scores = []
        print("{}:\n{}=".format(label, "=" * len(label)))
        target_words = []

        for fname in files:
            scount = 0
            tokens = word_tokenize(raw(fname))
            tokens = mwe_tokenizer.tokenize(tokens)
            tokens = list(filter(st.stem, tokens))
            bar = ""
            for t in tokens:
                if t in sentiment_bag:
                    bar += "*"
                    scount += 1
                    target_words.append(t)
            score = scount / len(tokens)
            print("{:35s}  {:3.6f} {:4} {}".format(os.path.basename(fname), score, scount, bar))
            scores.append(score)
        print("\n{} Average Score of {}: {:3.6f}".format(str.capitalize(label), len(scores), sum(scores) / len(scores)))
        print("\n\n")
        makecloud.cloud_for_document(outfile=label + ".png", fulltext=" ".join(target_words))
Example #36
def umbc_sim (title1, title2):
    '''
    compares the similarity of title1 and title2
    :param title1:
    :param title2:
    :return: a similarity score (float); higher means more similar
    '''
    #print datetime.now(), " Preprocessing titles..."
    title1 = title_prepocessing(title1)
    title2 = title_prepocessing(title2)
    #print datetime.now(), " Tokenization and parsing starts..."
    tokenizer = MWETokenizer(wn_bst.multi_words_xpn())
    tokens1 = tokenizer.tokenize(title1.split())
    #print datetime.now(), " First title tokenized."
    tagged1 = nltk.pos_tag(tokens1)
    #print datetime.now(), " First title parsed."
    tokens2 = tokenizer.tokenize(title2.split())
    #print datetime.now(), " Second title tokenized."
    tagged2 = nltk.pos_tag(tokens2)
    #print datetime.now(), " Second title parsed."
    # remove tokens that are not supported by WordNet
    tagged1 = [x for x in tagged1 if not wn_bst.get_wordnet_pos(x[1])=='']
    tagged2 = [x for x in tagged2 if not wn_bst.get_wordnet_pos(x[1])=='']
    #print datetime.now(), " Tokens cleaned."

    # use a matrix to store the result for later use
    #print datetime.now(), " Building matrix..."
    len1 = len(tagged1)
    len2 = len(tagged2)
    Matrix = np.zeros((len2,len1))
    result1 = {}
    result2 = {}
    for x in range(len1):
        token1=tagged1[x][0]
        pos1 = tagged1[x][1]
        simi = 0
        counterpart1 = ''
        for y in range(len2):
            token2 = tagged2[y][0]
            pos2 = tagged2[y][1]
            Matrix[y, x] = sim(token1, pos1, token2, pos2)
            if Matrix[y,x]>simi:
                simi = Matrix[y, x]
                counterpart1 = token2
        penalty1 = umbc_penalty(token1, pos1, tokens1, simi, counterpart1)
        result1[token1] = {'sim':simi, 'p':penalty1, 'counter':counterpart1}
    #print datetime.now(), " Title1 result calculated..."
    for y in range (0, len2):
        token2=tagged2[y][0]
        pos2 = tagged2[y][1]
        simi = 0
        counterpart2 = ''
        for x in range(0, len1):
            if Matrix[y,x]>simi:
                simi = Matrix[y,x]
                counterpart2 = tagged1[x][0]
                #print token2, counterpart2, simi
        penalty2 = umbc_penalty(token2, pos2, tokens2, simi, counterpart2)
        result2[token2] = {'sim':simi, 'p':penalty2, 'counter':counterpart2}
    #print datetime.now(), " Title2 result calculated..."
    #print result1
    sum1 = umbc_sum(result1)
    sum1 = float(sum1)
    #print result2
    sum2 = umbc_sum(result2)
    sum2 = float(sum2)
    #print sum1, sum2
    score = sum1/(2*len1)+sum2/(2*len2)
    #cut upper and lower bound
    if score < 0:
        score = 0


    return score