Example #1
    def get_context(self, query_str, text, k=10):
        if query_str in text:
            tokenizer = MWETokenizer()
            query_str_tokens = tuple(query_str.split())
            query_str_dashed = "_".join(query_str_tokens)
            tokenizer.add_mwe(query_str_tokens)
            text_token = tokenizer.tokenize(text.split())
            try:
                t_start = text_token.index(query_str_dashed)
            except ValueError:
                return None, None, None
            t_end = t_start + 1
            start_index = max(t_start - k, 0)
            end_index = min(t_end + k, len(text_token))
            text_token_query = text_token[start_index:t_start] + text_token[t_end + 1:end_index]
            context = " ".join(text_token_query)
            context_mention = text_token[start_index:t_start] + [query_str] + text_token[t_end + 1:end_index]
            context_mention = " ".join(context_mention)
            return context, text_token_query, context_mention
        else:
            logging.info('error, query not in text')
            return None, None, None
Example #2
    def __init__(self, locations):
        self.tokenizer = MWETokenizer()
        self.time_tagger = TimeTagger()
        for a in locations:
            self.tokenizer.add_mwe(a.split())
        # Rules defined
        self.specials = {
            "ACTIVITY": activities.union(["driving", "flight"]),
            "REGION": regions,
            "KEYWORD": [word for word in all_keywords if ' ' in word],
            "LOCATION": locations,
            "QUANTITY": ["at least", "more than", "less than", "at most",
                         "not more than", "a number of"],
            "IN": ["in front of", "called"],
            "NN": [phrase.replace("_", " ") for phrase in list(phrases.keys())],
            "SPACE": ["living room", "fastfood restaurant", "restaurant kitchen", "restaurant",
                      "dining hall", "food court", "butchers shop", "restaurant patio",
                      "coffee shop", "room", "hotel room", "kitchen", "office",
                      "airport", "salon"],
            "POSITION": ["side", "foreground", "background", "right", "left",
                         "image"],
            "TOBE": ["am", "is", "are", "be", "is being", "am being", "are being", "being"],
            "WAS": ["was", "were", "had been", "have been"],
            "TIMEPREP": ["prior to", "then", "earlier than", "later than", "sooner than"],
            "POSITION_PREP": ["near", "distance to"],

        }
        for tag in self.specials:
            for keyword in self.specials[tag]:
                if ' ' in keyword:
                    self.tokenizer.add_mwe(keyword.split())
Example #3
    def timexTagAndTokenizeText(self, altText=None):
        """In this method, two steps are required, so if altText is specified, all steps are done inside the if statement, so incorrect dict entries aren't stored"""
        if altText is not None:
            raw = altText
            altOutputStep1 = self.timexTagText(raw)
            altOutputStep2 = self.wordTokenizeText(altOutputStep1)
            time_tagged_and_tokenizedText = MWETokenizer(
                mwes=[('<', '/TIMEX2', '>'), ('<', 'TIMEX2', '>')],
                separator='').tokenize(altOutputStep2)

            return time_tagged_and_tokenizedText
        else:
            """Tag all temporal expressions with timex2 tags."""
            """Don't need to open file here, because it's opened in timexTagText()"""
            tagged = self.timexTagText()
            """Word-tokenize all text above"""
            word_tagged = self.wordTokenizeText(tagged)
        '''consolidate all broken apart Timex2 tags into single "words"'''
        if self.textList.get('timexTagAndTokenizeText') is None:
            self.textList['timexTagAndTokenizeText'] = [
                MWETokenizer(mwes=[('<', '/TIMEX2', '>'),
                                   ('<', 'TIMEX2', '>')],
                             separator='').tokenize(x) for x in word_tagged
            ]

        print self.textList.get('timexTagAndTokenizeText')
        return self.textList.get('timexTagAndTokenizeText')
Example #4
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
        pass

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
Example #5
def segment(text,
            userdict_filepath="userdict2.txt",
            stopwords_filepath='stopwords.txt'):
    import nltk
    with open(stopwords_filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip().lower() for line in f]
    final_list = []
    temp_list = []
    with open(userdict_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            temp_list.append(line.strip(' ').strip('\n'))
    temp = []
    for line in temp_list:
        for li in line.lower().split(' '):
            if len(li) != 0:
                temp.append(li.strip('\t'))
        final_list.append(tuple(temp))
        temp.clear()

    userdict_list = final_list
    tokenizer = MWETokenizer(userdict_list, separator=' ')

    seg_list = tokenizer.tokenize(
        nltk.word_tokenize(remove_symbols(text).lower()))

    seg_list_without_stopwords = []

    for word in seg_list:
        if word not in stopwords:
            if word != '\t':
                seg_list_without_stopwords.append(word)
    return seg_list_without_stopwords
Example #6
    def __init__(self,
                 ents=None,
                 tag2ent=None,
                 collocations=special_collocations,
                 appos=collocations.appos):
        self.__tokenizer = TweetTokenizer(reduce_len=True)
        self.__collocations = collocations
        self.__tknzr = MWETokenizer(self.__collocations)

        self.__lemm = WordNetLemmatizer()
        self.__nlp = spacy.load("en_core_web_sm")
        if ents is None:
            self.__ents = {}
            if tag2ent is not None:
                raise ValueError(
                    "ent2tag and ents should be None or not None both")
            self.__tag2ent = {}
        else:
            if tag2ent is None:
                raise ValueError(
                    "ent2tag and ents should be None or not None both")
            self.__ents = ents
            self.__tag2ent = tag2ent
        self.__appos = appos
        for a in appos:
            self.__appos[a] = '_'.join(self.__appos[a].split())

        self.__punctuation = punctuation + "“”‘’‚"
        self.__stop_symbols = '←↓→↑'
Example #7
def trim_bio(text):

    # keywords to return
    keywords = []

    # load from file after custom edit
    df_keyword = pd.read_csv(local_data + "data/keywords/df.csv")

    ## convert df to list
    important_words = df_keyword["Unnamed: 0"].tolist()

    ## format important words so that they can be registered to tokenizer
    important_words = [x.split() for x in important_words]

    # initialize tokenizer
    tokenizer = MWETokenizer()
    for iw in important_words:
        tokenizer.add_mwe([x for x in iw])  # add important words
        #tokenizer.add_mwe(iw)  # add important words

    # tokenize bio
    tokens = tokenizer.tokenize([word.lower() for word in text.split()])

    # find important words from tokens, append it to keyword
    for iw in important_words:
        iw_joined = "_".join(iw)
        if (iw_joined in tokens):
            keywords.append(iw_joined)

    return keywords
Example #8
    def __init__(self, filename):
        """initializes a LyricsCleaner object"""
        self._filename = filename
        self._tokenizer = MWETokenizer()
        for word in SIGNAL_WORDS:
            self._tokenizer.add_mwe(('[', word, ']'))
        self._stemmer = LancasterStemmer()
Example #9
def tokenization(docs):
    documents = {}

    for doc in docs:
        document_plain = docs[doc]
        document_plain = document_plain.replace("/", "").replace("-", " ")
        #re.sub(r'\([^)]*\)', '', document_plain)
        document_plain = re.sub(r'\([0-9]*\)', '', document_plain)

        relevant_words = []
        mwetokenizer = MWETokenizer()
        document_ner = spacy_nlp(document_plain)

        for element in document_ner.ents:
            # don't consider numbers
            if element.label_ != "CARDINAL":
                relevant_words.append(element)

        #for each relevant word, if whitespace is present, create a single token with all the words
        for word in relevant_words:
            token = str(word).split()
            if len(token) > 1:
                move_data = []
                for element in token:
                    move_data.append(element)
                tup = tuple(move_data)
                mwetokenizer.add_mwe(tup)

        document_tokenized = word_tokenize(document_plain)
        document_retokenized = mwetokenizer.tokenize(document_tokenized)

        documents[doc] = document_retokenized
    return documents
Example #10
def initialize_known_phrase_tokenization(phrases):
    from nltk.tokenize import MWETokenizer
    tokenizer = MWETokenizer()
    for phrase in phrases:
        if (phrase):
            phrase_as_list = phrase.replace("_", " ").split()
            tokenizer.add_mwe(phrase_as_list)
    return tokenizer
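A minimal usage sketch for the helper above; the phrase strings are hypothetical, not from the original source:

tokenizer = initialize_known_phrase_tokenization(["new_york", "machine_learning"])
print(tokenizer.tokenize("i love new york".split()))
# expected output: ['i', 'love', 'new_york']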
Example #11
def multiword_tokenizer(token_list, bigram_list):
    """
    Tokenize a list of unigram tokens into bigram tokens,
    given a list of bigrams.
    Bigrams are separated with "__"
    """
    mwetokenizer = MWETokenizer(bigram_list, separator="__")
    return mwetokenizer.tokenize(token_list)
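A short usage sketch, assuming `from nltk.tokenize import MWETokenizer` is in scope as in the other examples; the bigram list is a made-up illustration:

print(multiword_tokenizer(["ice", "cream", "is", "great"], [("ice", "cream")]))
# expected output: ['ice__cream', 'is', 'great']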
Example #12
    def fit(self, X, **fit_params):
        """
        Procedure to iteratively contract bigrams (up to max_collocation_iterations times)
        that score higher on the collocation_function than the min_collocation_score (and satisfy other
        criteria set out by the optional parameters).
        """
        self.tokenization_ = X
        n_tokens = sum([len(x) for x in X])
        for i in range(self.max_iterations):
            bigramer = BigramCollocationFinder.from_documents(
                self.tokenization_)

            if self.ignored_tokens is not None:
                ignore_fn = lambda w: w in self.ignored_tokens
                bigramer.apply_word_filter(ignore_fn)

            if self.excluded_token_regex is not None:
                exclude_fn = lambda w: re.fullmatch(
                    self.excluded_token_regex, w) is not None
                bigramer.apply_word_filter(exclude_fn)

            if self.min_token_occurrences is not None:
                minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
                bigramer.apply_word_filter(minocc_fn)

            if self.max_token_occurrences is not None:
                maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
                bigramer.apply_word_filter(maxocc_fn)

            if self.min_token_frequency is not None:
                minfreq_fn = lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens
                bigramer.apply_word_filter(minfreq_fn)

            if self.max_token_frequency is not None:
                maxfreq_fn = lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens
                bigramer.apply_word_filter(maxfreq_fn)

            if self.min_ngram_occurrences is not None:
                bigramer.apply_freq_filter(self.min_ngram_occurrences)

            new_grams = list(
                bigramer.above_score(self.score_function, self.min_score))

            if len(new_grams) == 0:
                break

            self.mtes_.append(new_grams)

            contracter = MWETokenizer(new_grams)
            self.tokenization_ = tuple([
                tuple(contracter.tokenize(doc)) for doc in self.tokenization_
            ])

        return self
Example #13
def clean_text(dirty_text, external_vocab_filename=None, external_vocab_level="no"):
    """(str, str, str) -> str

    external_vocab_level can be: no, with_ngrams, only.
    if you choose with_ngrams or only, you need to add an external_vocab_filename

    The text is cleaned in the following way:
    # substitute word1.word2 by word1. word2
    # split text into rough sentences based on '<\\%>'.  This symbol was added to denote
      a new line on the original product description
    # split the rough sentences using a sentence tokenizer from nltk
    # if external_vocab_level is 'with_ngrams' or 'only', combine the external-vocabulary ngrams into one token. e.g., short sleeves -> short_sleeves

    # concatenate all tokenized words into one string and return string.

    An excerpt of text from the item looks like this:
    Sheath/Column One-Shoulder Short/Mini Bridesmaid Dress With Lace<\\%>SKU#:1020234<\\%>New Style Cocktail Dress<\\%>
    Color:The color as picture<\\%>Category:Bridesmaid Dress<\\%> Brand:Biggoldapple<\\%>
    Silhouette:Sheath/Column<\\%> Neckline:One-Shoulder<\\%> Hemline/Train:Short/Mini<\\%>

    """

    external_vocab_words = []
    if external_vocab_filename is not None:
        external_vocab_words = load_vocab_words(external_vocab_filename)

        # transform ngrams into tuples
        external_vocab_with_tuples = [tuple(z.split('_')) for z in external_vocab_words]  # assume that ngrams are separated by underscore: word1_word2.
        # multiple word tokenizer, more info: http://www.nltk.org/api/nltk.tokenize.html
        tokenizer_mwe = MWETokenizer(external_vocab_with_tuples)  # all external_vocab_words are added

    out_clean_text = ''
    # substitute word1.word2 by word1. word2
    dirty_text = re.sub(r"(\w[A-Z]|[a-z.])\.([^.)\s])", r"\1. \2", dirty_text)
    rough_sentences = dirty_text.split('<\\%>')  # sentences based on splitting by '<\\%>'

    sentences = []
    for rs in rough_sentences:
        rs = rs.replace("3/4", "3_4")  # just to keep the 3/4 as 3_4
        sentences.extend(SENT_TOKENIZER.tokenize(rs))  # sentences based on NLTK tokenizer

    for sentence in sentences:
        words = WORD_TOKENIZER.tokenize(sentence.lower())  # tokenize based on words. ignore that zappos vocabulary exists

        if external_vocab_level == 'with_ngrams':
            # keep all words (even those not in zappos), but group zappos ngrams into one token
            words = tokenizer_mwe.tokenize(words)  # group zappos_ngrams into one token.
        elif external_vocab_level == 'only':
            words = tokenizer_mwe.tokenize(words)  # group zappos_ngrams into one token.
            words = [w for w in words if w in external_vocab_words]  # only keep words in the zappos vocabulary

        words = [w for w in words if (not w.isdigit() or w == '3_4')]  # remove words that are just digits, but leave 3_4

        words_concat = ' '.join(words) + '\n'
        out_clean_text += words_concat

    return out_clean_text
Example #14
def tokenize_and_remove_punct(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    mtokenizer = MWETokenizer()
    mwe = mtokenizer.tokenize(text.split())
    words = []
    for t in mwe:
        if t.isalpha():
            words.append(t)
    return words
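A usage sketch for the function above, assuming `import string` and `from nltk.tokenize import MWETokenizer`; the input sentence is hypothetical:

print(tokenize_and_remove_punct("Hello, world! 42 times."))
# expected output: ['Hello', 'world', 'times']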
Example #15
def phrase_eval(params):
    list_phrases, unigram_set, target_token, idf, agg_score, pid = params

    idf_list = [*idf]
    idf_set = set(idf_list)

    tokenizer = MWETokenizer(separator=' ')
    for e in unigram_set:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    phrases_score = {}
    for phrase in tqdm(list_phrases,
                       desc='phrase-eval-{}'.format(pid),
                       mininterval=10):
        score = 0
        tokens = nltk.word_tokenize(phrase)
        if not set(tokens).issubset(idf_set):
            continue
        nonstop_tokens = [token for token in tokens if token not in stop]
        if len(nonstop_tokens) / len(tokens) <= 0.5:
            continue
        raw_tokenized = tokenizer.tokenize(tokens)
        tokenized_set = set(raw_tokenized)
        keywords = tokenized_set.intersection(unigram_set)
        for token in keywords:
            score += agg_score[token]
        score /= (1 + np.log(len(nonstop_tokens)))

        vocab = set(target_token).union(set(tokens))
        vocab = list(vocab.intersection(idf_set))
        target_vec = [0] * len(vocab)
        phrase_vec = [0] * len(vocab)

        target_token_freq = dict(Counter(target_token))
        target_token_subset = list(set(vocab).intersection(set(target_token)))
        for token in target_token_subset:
            index = vocab.index(token)
            target_vec[index] = target_token_freq[token] / len(
                target_token) * idf[token]

        phrase_token_freq = dict(Counter(tokens))
        for token in tokens:
            index = vocab.index(token)
            phrase_vec[index] = phrase_token_freq[token] / len(
                tokens) * idf[token]

        tfidf_sim = 1 - spatial.distance.cosine(target_vec, phrase_vec)

        phrases_score.update({phrase: {'score': score, 'eval': tfidf_sim}})

    rearrange = {}
    for k, v in phrases_score.items():
        rearrange.update({k: v['score']})
    top_10 = nlargest(10, rearrange, key=rearrange.get)

    return {key: phrases_score[key] for key in top_10}
Example #16
def tokenizer_sent(dataset):
    tokenizer = MWETokenizer()
    aspect_tokenized = []
    sentence_tokenized = []
    for i in range(0, len(dataset.index)):
        aspect_split = tuple(dataset['aspect_term'][i].lower().split())
        tokenizer.add_mwe(aspect_split)  # add_mwe returns None; register the MWE and keep the tuple itself
        aspect_tokenized.append(aspect_split)
    for j in range(0, len(dataset.index)):
        tok = nltk.pos_tag(
            tokenizer.tokenize(dataset['text'][j].lower().split()))
        sentence_tokenized.append(tok)
Example #17
def multiword_tokenize(text, mwe):
    # word_tokenize splits contractions ("Don't" => 'Do', "n't"); sentence-level "," and "." become separate tokens
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    tokenized_text = tokenizer.tokenize(word_tokenize(text))
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
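A usage sketch for multiword_tokenize; the sentence and MWE list are made up, and NLTK's punkt tokenizer data is assumed to be installed:

print(multiword_tokenize("She moved to New York City last year.", ["New York City"]))
# expected output: ['She', 'moved', 'to', 'New York City', 'last', 'year', '.']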
Example #18
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
        pass

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # For each sentence in the sentences

                # Tokenize the sentence based on Regex and then using MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))

                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]

                # Lemmatize the sentence. Find the POS tags and then lemmatize
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag)) for wrd, tag in tokens_lowercase_tagged]

                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]

                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]

                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
Example #19
    def sentence_filter(self, sentence):  # preliminary tokenization and cleaning of the sentence

        if self.language == 'chinese':

            import jieba.posseg as psg

            return psg.cut(sentence)  # use jieba's POS-tagging interface to tokenize and clean in one step

        elif self.language == 'english':

            from nltk.tokenize import MWETokenizer  # use the MWE tokenizer

            tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with underscore '_'

            nlp = spacy.load('en_core_web_sm')  # build the spaCy pipeline

            # for word in self.userdict:    # adding custom words to spaCy; appears to have no effect
            #     lex = nlp.vocab[word]

            # clean up punctuation
            quote_double_pattern = re.compile('“|”')
            quote_single_pattern = re.compile('‘|’')
            punc_pattern = re.compile(
                "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,")

            sentence = re.sub(quote_double_pattern, '"', sentence)
            sentence = re.sub(quote_single_pattern, "'", sentence)  # keep 's and s' cases, so don't just delete the quote
            sentence = re.sub(punc_pattern, ' ', sentence)

            # use nltk plus spacy to get the token list; with pke, return the full sentence instead
            # return nlp(' '.join(sentence.split()))    # spacy
            return nlp(' '.join(tokenizer.tokenize(sentence.lower().split())))  # nltk + spacy: first merge phrases with nltk, then tokenize with spaCy
            # return sentence    # pke

        elif self.language == 'japanese':

            mecab = MeCab.Tagger('')  # use MeCab's tagger directly; custom dictionaries can't be added yet, so some proper nouns (e.g. 比特/币, "bitcoin") are not recognized

            # clean up punctuation
            punc_pattern = re.compile(
                "\xa0|\t|\n|\:|\;| — | - |\!|\@|\#|\$|\%|\^|\&|\*|\_|\?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|/|・|「|」|•|※")
            sentence = re.sub(punc_pattern, ' ', sentence)

            sentence = [
                (
                    chunk.split('\t')[0],
                    chunk.split('\t')[1].split(',')[0]
                )
                for chunk in mecab.parse(sentence).splitlines()[:-1]
            ]  # extract the surface form and part of speech from each parsed entry

            return sentence
Example #20
def text_process_group(mess):
    """
    1. Lower case the input
    2. Remove punctuation except '-'
    3. Apply custom tokenizer
    4. Return column of clean text words"""
    mess = mess.lower()
    # remove punctuation except '-'; the original pattern r"[^\P{P}-]+" needs the third-party 'regex' module, so use an re-compatible pattern here
    regex = r"[^\w\s-]+"
    new_mess = re.sub(regex, " ", mess, 0)
    tokenizer = MWETokenizer(all_list, separator=' ')
    token = tokenizer.tokenize(new_mess.lower().split())
    sw = [x for x in token if x not in stopwords.words('english')]
    return sw
Example #21
    def sentence_filter(self, sentence):
        tokenizer = MWETokenizer(self.userdict)  # add custom phrases, joined with underscore '_'
        nlp = spacy.load('en_core_web_sm')  # build the spaCy pipeline
        quote_double_pattern = re.compile('“|”')
        quote_single_pattern = re.compile('‘|’')
        punc_pattern = re.compile(
            "\"|\xa0|\t|\n|\:|\;| — | - |–-|\!|\@|\#|\$|\%|\^|\*|\_|\?|?|\(|\)|\[|\]|\{|\}|\<|\>|\||\+|\=|\~|\`|°|\\|\/|,"
        )
        sentence = re.sub(quote_double_pattern, '"', sentence)
        sentence = re.sub(quote_single_pattern, "'",
                          sentence)  # keep 's and s' cases, so don't just delete the quote
        sentence = re.sub(punc_pattern, ' ', sentence)
        return nlp(' '.join(tokenizer.tokenize(
            sentence.lower().split())))  # nltk + spacy: first merge phrases with nltk, then tokenize with spaCy
Example #22
def multi_word_tokenizer(relevant_words, text):
    mwetokenizer = MWETokenizer()

    #add tuples of words into multiword tokenizer
    for word in relevant_words:
        token = str(word).split()
        move_data=[]
        for element in token:
            move_data.append(element)
        tup = tuple(move_data)
        mwetokenizer.add_mwe(tup)

    #execute multitokenization
    return mwetokenizer.tokenize(text)
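A usage sketch for multi_word_tokenizer; the inputs are hypothetical, and the text argument is an already-tokenized list:

print(multi_word_tokenizer(["San Francisco"], ["I", "flew", "to", "San", "Francisco"]))
# expected output: ['I', 'flew', 'to', 'San_Francisco']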
Example #23
    def tokenize_sentence(self, string, max_sentence_len, with_labels=False):
        merger = MWETokenizer([('<', 'unk', '>')], separator = '')
        sentence = word_tokenize(string.strip())       # tokenize sentence
        sentence = merger.tokenize(sentence)         # merge <unk>
        if with_labels:
            sentence = sentence[1:]
        sentence = [token.lower() for token in sentence]
        sentence = sentence[:max_sentence_len - 2]   # cut sentence at max_sentence_length
        sentence = ['<sos>'] + sentence + ['<eos>']  # add start and end-of-sentence tags

        # pad the rest of the sentence
        padded_sentence = sentence.copy()
        padded_sentence.extend(['<pad>']*(max_sentence_len - len(sentence)))

        return sentence, padded_sentence
Example #24
def multiword_tokenize(text, mwe):
    # Initialize the MWETokenizer
    protected_tuples = [word_tokenize(word) for word in mwe]
    protected_tuples_underscore = ['_'.join(word) for word in protected_tuples]
    tokenizer = MWETokenizer(protected_tuples)
    # Tokenize the text.
    #tokenized_text = tokenizer.tokenize(word_tokenize(text,language='French'))
    #print(tokenize(text))
    tokenized_text = tokenizer.tokenize(tokenize(text))

    #print(tokenized_text)
    # Replace the underscored protected words with the original MWE
    for i, token in enumerate(tokenized_text):
        if token in protected_tuples_underscore:
            tokenized_text[i] = mwe[protected_tuples_underscore.index(token)]
    return tokenized_text
Example #25
def spans(txt):
    tokens = MWETokenizer().tokenize(word_tokenize(txt))  # tokenize needs an MWETokenizer instance
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
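A usage sketch for spans (with the instance fix above), assuming NLTK's punkt data is available; the sentence is hypothetical:

for token, start, end in spans("NLTK is fun"):
    print(token, start, end)
# expected output:
# NLTK 0 4
# is 5 7
# fun 8 11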
Example #26
    def __init__(self, address_and_gps):
        self.tokenizer = MWETokenizer()
        self.time_tagger = TimeTagger()
        for a in address_and_gps:
            self.tokenizer.add_mwe(a.split())
        # Rules defined
        self.specials = {
            "QUANTITY": [
                "at least", "more than", "less than", "at most",
                "not more than", "a number of"
            ],
            "IN": ["in front of", "called"],
            "NN": [
                "cafe sign", "traffic light", "fire hydrant", "stop sign",
                "parking meter", "baseball bat", "baseball glove",
                "cell phone", "teddy bear", "hair drier"
                "airport vehicles", "airport vehicle", "screen"
            ],
            "SPACE": [
                "living room", "fastfood restaurant", "restaurant kitchen",
                "restaurant", "dining hall", "food court", "butchers shop",
                "restaurant patio", "coffee shop", "room", "hotel room",
                "kitchen", "office", "airport", "salon"
            ],
            "POSITION":
            ["side", "foreground", "background", "right", "left", "image"],
            "LOCATION": [
                "home", "school", "oslo", "norway", "hotel", "tromso", "bank",
                "ireland", "china", "japan", "vietnam", 'dcu', 'dublin',
                'dublin city university'
            ],
            "TOBE": [
                "am", "is", "are", "be", "is being", "am being", "are being",
                "being"
            ],
            "WAS": ["was", "were", "had been", "have been"],
            "TIMEPREP": ["prior to", "then"],
            "POSITION_PREP": ["near", "distance to"]
        }

        for tag in [
                "QUANTITY", "IN", "NN", "SPACE", "POSITION", "LOCATION",
                "TOBE", "WAS", "TIMEPREP", "POSITION_PREP"
        ]:
            for keyword in self.specials[tag]:
                if ' ' in keyword:
                    self.tokenizer.add_mwe(keyword.split())
Example #27
    def _init_mwe_tokenizer(self):
        def multi_word_expressions():
            for entity in self.vocab:
                if entity.find(self._PHRASE_DELIMITER) != -1:
                    yield entity.split(self._PHRASE_DELIMITER)

        it = multi_word_expressions()
        self._mwe_tokenizer = MWETokenizer(it)
Example #28
def k_tokenizer(text):
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([('ios', '9'),])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """

    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc=[]
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
Example #29
def k_tokenizer(text):
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]
    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([
        ('ios', '9'),
    ])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)
    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """
    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc = []
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(
            token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
Example #30
def tokenize(multi_word_queries, text):
    """Returns a list of words that make up the text.
    Params: {text: String}
    Returns: List
    """
    lower_case = text.lower()
    tokenizer = RegexpTokenizer(
        r'not\s+very\s+[a-z]+|not\s+[a-z]+|no\s+[a-z]+|[a-z]+')
    result = tokenizer.tokenize(lower_case)
    multi_tokenizer = MWETokenizer([('working', 'out'), ('coffee', 'shops'),
                                    ('average', 'prices'), ('union', 'square'),
                                    ('real', 'estate'), ('ice', 'cream'),
                                    ('whole', 'foods'), ('co', 'op'),
                                    ('wall', 'street'), ('world', 'trade'),
                                    ('high', 'school'), ('dim', 'sum'),
                                    ('empire', 'state'), ('high', 'rise'),
                                    ('walk', 'ups')])
    if len(multi_word_queries) > 0:
        for tok in multi_word_queries:
            if (len(tok.split('_')) > 1):
                multi_tokenizer.add_mwe(tuple(tok.split('_')))
    #add neighborhood names
    for n in neighborhood_name_phrases:
        multi_tokenizer.add_mwe(tuple(n.split('_')))

    result2 = multi_tokenizer.tokenize(result)
    return result2
Example #31
def merge_task(task_list, args):
    with open('{}/wiki_quality.txt'.format(args.entity_dir), 'r') as f:
        raw_list = f.read()

    nlp = spacy.load('en_core_web_lg', disable=['ner'])

    entityset = set(raw_list.split('\n'))

    tokenizer = MWETokenizer(separator=' ')

    for e in entityset:
        tokenizer.add_mwe(nltk.word_tokenize(e))

    print("successfully read entity file and initialized tokenizer")
    sys.stdout.flush()

    for fname in task_list:
        outputname = 'SENTENCE_ENTITY_{}'.format(fname.split('_')[-1])
        context = []

        with open('{}/{}'.format(args.input_dir,fname), 'r') as f:
            doc = f.readlines()

        for item in tqdm(doc, desc='{}'.format(fname), mininterval=30):
            item_dict = json.loads(item)
            sent = nltk.word_tokenize(item_dict['text'])
            raw_tokenized = tokenizer.tokenize(sent)
            tokenized_set = set(raw_tokenized)
            mentioned_entity = list(tokenized_set.intersection(entityset))
            if len(mentioned_entity) != 0:
                doc = nlp(item_dict['text'])
                item_dict.update({'entityMentioned':mentioned_entity})
                unigram = [token.text for token in textacy.extract.ngrams(doc,n=1,filter_nums=True, filter_punct=True, filter_stops=True)]
                item_dict['unigram'] = unigram
                tokens = [token.text for token in doc]
                pos = [token.pos_ for token in doc]
                phrases = phrasemachine.get_phrases(tokens=tokens, postags=pos)
                item_dict['phrases'] = list(phrases['counts'])
                context.append(json.dumps(item_dict))

        with open('{}/{}'.format(args.output_dir, outputname), "w+") as f:
            f.write('\n'.join(context))
Example #32
    def iteratively_contract_bigrams(self):
        """
        Procedure to iteratively contract bigrams (up to max_collocation_iterations times)
        that score higher on the collocation_score_function than the min_collocation_score
        """
        for i in range(self.max_collocation_iterations):
            bigramer = BigramCollocationFinder.from_documents(self.tokens_by_sent())
            mwes = list(
                bigramer.above_score(
                    self.collocation_score_function, self.min_collocation_score
                )
            )
            if len(mwes) == 0:
                break
            contracter = MWETokenizer(mwes)
            self.tokens_by_sent_by_doc_ = [
                contracter.tokenize_sents(doc) for doc in self.tokens_by_sent_by_doc()
            ]
Example #33
            "   mean: {}  median: {}  stddev: {}".format(
                statistics.mean(counts), statistics.median(counts), statistics.stdev(counts)
            )
        )
        print("\n")

    # Read the words of interest
    words = open("emotion_words.txt").read().lower().split("\n")
    sentiment_bag = set()

    # Get the multi-word expression tokenizer and add each to the sentiment_bag
    mwe = set(filter(lambda a: " " in a, words))
    print("Multi-word expressions in emotion words: {}".format(",".join(mwe)))

    # Create the MWE tokenizer
    mwe_tokenizer = MWETokenizer()
    for s in mwe:
        print("Add mwe ", s)
        mwe_tokenizer.add_mwe(s.split(" "))
        sentiment_bag.add(s.replace(" ", "_"))

    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    ps = PorterStemmer()
    print("Stemming:")
    for word in filter(lambda a: " " not in a, words):
        print("{} => {} / {} / {}".format(word, lmtzr.lemmatize(word), st.stem(word), ps.stem(word)))
        sentiment_bag.add(word)
        sentiment_bag.add(st.stem(word))  # I like this one the best

    # Process all the lists
Example #34
    def extract_expressions(self, document, features=None):
        """Returns expressions from given features and multi-word expressions.
        
        In addition to passing a document into this method, MWEs or Multi-Word Expressions
        can be given to treat some multi words as one expression.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL" :
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal to width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract data fields from.
            features(list): List of features containing data fields to extract. If not given, use
                            all document features.
            mwes(list): List of Multi-Word Expressions. Example value:
                        `['property type', 'single family']`. With that list, both "property type"
                        and "single family" will each be treated as single expressions.        
        Returns:
            np.array: An array of data_fields.
        """
        mwes = self.mwes
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        for idx, mwe in enumerate(mwes):
            if isinstance(mwe, str):
                mwes[idx] = word_tokenize(mwe.lower())
            elif hasattr(mwe, '__iter__'):
                mwes[idx] = [x.lower() for x in mwe]
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

        expressions = []
        pos = 0
        for token in tokenized:
            # token could be "deez nutz" but text contains multiple spaces e.g. "deez  nutz",
            # so we need to split the token and find position of first and last characters.
            words = token.split()
            start_pos = text.lower().index(words[0], pos)
            for word in words:
                ipos = text.lower().index(word, pos)
                end_pos = ipos + len(word)
            pos = end_pos
            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            if len(features[start_pos:end_pos,:]) > 0:
                min_x =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x')]
                max_x =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('x1')]
                min_y =  np.amin(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y')]
                max_y =  np.amax(features[start_pos:end_pos,:], axis=0)[ArthurDocument.get_feature_id('y1')]
                page = features[start_pos, ArthurDocument.get_feature_id('page')]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions
Example #35
					help='Use as training data and create dictionary (default: test data)')
args = parser.parse_args()
data = [] # json output from extract.py

for line in fileinput.input(files='-'):
	data = json.loads(line)

miscFeatures = [] # other features
abstracts = []
tool = []
# tokenize the text

# pairs for conditional frequency distribution
pairs = []

tokenizer = MWETokenizer([('web', 'server'), ('software', 'package'), ('R', 'package'), ('freely', 'available'), ('source', 'code')])

for i in range(len(data)):
	# change github, bioconductor, sourceforge urls to unique words
	title = data[i]['title']
	colon = title.find(':')
	miscFeatures.append([])

	if colon >= 0:
		miscFeatures[i].append(1)

		pairs.append((i+1, '-hascolon'))
		a = title[0:colon]
		b = a.lower()
		diff = sum(a[k] != b[k] for k in range(len(a)))
		# ratio = diff/len(a)
Example #36
def umbc_sim (title1, title2):
    '''
    compares the similarity of title1 and title2
    :param title1:
    :param title2:
    :return: a float similarity score (higher means more similar)
    '''
    #print datetime.now(), " Preprocessing titles..."
    title1 = title_prepocessing(title1)
    title2 = title_prepocessing(title2)
    #print datetime.now(), " Tokenization and parsing starts..."
    tokenizer = MWETokenizer(wn_bst.multi_words_xpn())
    tokens1 = tokenizer.tokenize(title1.split())
    #print datetime.now(), " First title tokenized."
    tagged1 = nltk.pos_tag(tokens1)
    #print datetime.now(), " First title parsed."
    tokens2 = tokenizer.tokenize(title2.split())
    #print datetime.now(), " Second title tokenized."
    tagged2 = nltk.pos_tag(tokens2)
    #print datetime.now(), " Second title parsed."
    # remove tokens that are not supported by WordNet
    tagged1 = [x for x in tagged1 if not wn_bst.get_wordnet_pos(x[1])=='']
    tagged2 = [x for x in tagged2 if not wn_bst.get_wordnet_pos(x[1])=='']
    #print datetime.now(), " Tokens cleaned."

    # use a matrix to store the result for later use
    #print datetime.now(), " Building matrix..."
    len1 = len(tagged1)
    len2 = len(tagged2)
    Matrix = np.zeros((len2,len1))
    result1 = {}
    result2 = {}
    for x in range(len1):
        token1=tagged1[x][0]
        pos1 = tagged1[x][1]
        simi = 0
        counterpart1 = ''
        for y in range(len2):
            token2 = tagged2[y][0]
            pos2 = tagged2[y][1]
            Matrix[y, x] = sim(token1, pos1, token2, pos2)
            if Matrix[y,x]>simi:
                simi = Matrix[y, x]
                counterpart1 = token2
        penalty1 = umbc_penalty(token1, pos1, tokens1, simi, counterpart1)
        result1[token1] = {'sim':simi, 'p':penalty1, 'counter':counterpart1}
    #print datetime.now(), " Title1 result calculated..."
    for y in range (0, len2):
        token2=tagged2[y][0]
        pos2 = tagged2[y][1]
        simi = 0
        counterpart2 = ''
        for x in range(0, len1):
            if Matrix[y,x]>simi:
                simi = Matrix[y,x]
                counterpart2 = tagged1[x][0]
                #print token2, counterpart2, simi
        penalty2 = umbc_penalty(token2, pos2, tokens2, simi, counterpart2)
        result2[token2] = {'sim':simi, 'p':penalty2, 'counter':counterpart2}
    #print datetime.now(), " Title2 result calculated..."
    #print result1
    sum1 = umbc_sum(result1)
    sum1 = float(sum1)
    #print result2
    sum2 = umbc_sum(result2)
    sum2 = float(sum2)
    #print sum1, sum2
    score = sum1/(2*len1)+sum2/(2*len2)
    #cut upper and lower bound
    if score < 0:
        score = 0


    return score
Example #37
whitespace_tokenizer = WhitespaceTokenizer()
wnl = WordNetLemmatizer()

from spacy.en import English
nlp = English()


# This is for multi-word-phrases. 
MWE = [] 
path = "/".join(os.path.realpath(__file__).split("/")[:-2]) + '/input/'
print 'path', path
with open(path+'STREUSLE2.1-mwes.tsv') as f:
    for line in f.readlines():
        multiword_expression = line.split('\t')[0].split()[1:]
        MWE.append(multiword_expression)
MWE_tokenizer = MWETokenizer(MWE, separator='-')
# Add whatever additional custom multi-word-expressions.
MWE_tokenizer.add_mwe(('dive', 'bar'))
# Stopwords
stops = set(stopwords.words("english") + stopwords.words("spanish"))
keep_list = ['after', 'during', 'not', 'between', 'other', 'over', 'under', 
             'most', 'without', 'nor', 'no', 'very', 'against', 'don', 'aren']
stops = set([word for word in stops if word not in keep_list])


table = string.maketrans("","")



sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
whitespace_tokenizer = WhitespaceTokenizer()