Example #1
    def date_unique(self, nmb, start_date=None, end_date=None):
        self.date_slice1(self.df, start_date, end_date)
        self.df1_nd = deepcopy(self.df.iloc[
            np.delete(np.arange(len(self.df)), self.df1.index), :])
        self.df1.index = np.arange(len(self.df1))
        self.nmb = nmb
        self.d = ''
        det = TreebankWordDetokenizer()
        # pool all posts and comments that fall inside the selected date range
        for i in range(len(self.df1)):
            self.d += ' ' + det.detokenize(self.df1['post'][i])
        for i in range(len(self.df1)):
            for j in range(len(self.df1['comment'][i])):
                self.d += ' ' + det.detokenize(self.df1['comment'][i][j])
        self.df1_nd.index = np.arange(len(self.df1_nd))
        self.nd = ''
        # pool all posts and comments that fall outside the selected date range
        for i in range(len(self.df1_nd)):
            self.nd += ' ' + det.detokenize(self.df1_nd['post'][i])
        for i in range(len(self.df1_nd)):
            for j in range(len(self.df1_nd['comment'][i])):
                self.nd += ' ' + det.detokenize(self.df1_nd['comment'][i][j])
        self.ls_dt = [self.d, self.nd]
        # rank terms by their TF-IDF weight in the in-range document
        vectorizer = TfidfVectorizer()
        self.X = vectorizer.fit_transform(self.ls_dt)
        self.tt = pd.DataFrame(self.X.toarray(),
                               columns=vectorizer.get_feature_names())
        self.dick = dict(zip(self.tt.columns, self.tt.loc[0]))
        self.utp = sorted(self.dick, key=self.dick.get, reverse=True)
        print('{} most significant words for the given period\n'.format(
            self.nmb))
        for i in range(nmb):
            print('{}:'.format(i + 1) + ' ' + self.utp[i])
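
The TF-IDF ranking at the end of date_unique can be exercised on its own. A minimal standalone sketch with made-up documents (it uses get_feature_names_out, the newer name of the accessor the original calls):

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# two pooled "documents": text inside the date range vs. text outside it
docs = ["cats climb trees and cats sleep", "dogs chase cars and dogs bark"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
tt = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# score every term for the first document and print the top 3
scores = dict(zip(tt.columns, tt.loc[0]))
for rank, term in enumerate(sorted(scores, key=scores.get, reverse=True)[:3], 1):
    print('{}: {}'.format(rank, term))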
Example #2
def remove_spaCy_stop3(all_data, train_data, test_data):
    spacy_nlp = spacy.load('en')
    sw = spacy.lang.en.stop_words.STOP_WORDS
    deto = Detok()

    all_cleaned = list()
    train_cleaned = list()
    test_cleaned = list()

    for article in all_data:
        word_tokens = word_tokenize(article)
        all_cleaned.append(
            deto.detokenize([w for w in word_tokens if not w in sw]))

    for article in train_data:
        word_tokens = word_tokenize(article)
        train_cleaned.append(
            deto.detokenize([w for w in word_tokens if not w in sw]))

    for article in test_data:
        word_tokens = word_tokenize(article)
        test_cleaned.append(
            deto.detokenize([w for w in word_tokens if not w in sw]))

    return all_cleaned, train_cleaned, test_cleaned
Example #3
def getSummary(game, gen_summary):
    """
    Reads the summary token list from the game JSON and detokenizes it.
    Does the same for the generated summary.
    """
    detokenizer = TreebankWordDetokenizer()
    summary = game["summary"]

    return detokenizer.detokenize(summary), detokenizer.detokenize(gen_summary)
Example #4
def print_poem(poem):
    d = TreebankWordDetokenizer()

    line1, line2, line3 = _poem_to_lines(poem)
    n1 = nsyl_line(line1)
    n2 = nsyl_line(line2)
    n3 = nsyl_line(line3)
    line1 = d.detokenize(line1)
    line2 = d.detokenize(line2)
    line3 = d.detokenize(line3)
    print(str(n1) + ' | ' + line1 + '\n' + \
          str(n2) + ' | ' + line2 + '\n' + \
          str(n3) + ' | ' + line3 + '\n')
Example #5
    def tokeniz(self, df):

        #########COMMENTS#####################
        for i in range(len(df)):
            df["comment"][i] = list(df["comment"][i][2:-2].replace(
                "'", '').split(','))
        tw = TweetTokenizer()
        det = TreebankWordDetokenizer()
        for i in (range(len(df))):
            for j in range(len(df["comment"][i])):
                tokenized_example = (tw.tokenize(df["comment"][i][j]))
                filtered_example = [
                    word for word in tokenized_example
                    if not word in self.sum_noise
                ]
                df["comment"][i][j] = det.detokenize(filtered_example)
        mystem_analyzer = Mystem(entire_input=False)
        for i in (range(len(df))):
            df["comment"][i] = [
                mystem_analyzer.lemmatize(w) for w in df["comment"][i]
            ]
            df["comment"][i] = list(filter(None, df["comment"][i]))
        for i in range(len(df)):
            for j in range(len(df['comment'][i])):
                df['comment'][i][j] = [
                    word for word in df['comment'][i][j]
                    if not word in self.sum_noise
                ]

        ##########POSTS##############
        for i in (range(len(df))):
            tokenized_example = (tw.tokenize(df["post"][i]))
            filtered_example = [
                word for word in tokenized_example
                if not word in self.sum_noise
            ]
            df["post"][i] = det.detokenize(filtered_example)
        for i in (range(len(df))):
            a = []
            a.append(df['post'][i])
            df["post"][i] = a
        for i in (range(len(df))):
            df["post"][i] = [
                mystem_analyzer.lemmatize(w) for w in df["post"][i]
            ][0]
        for i in range(len(df)):
            df['post'][i] = [
                word for word in df['post'][i] if not word in self.sum_noise
            ]

        return df
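
The per-comment cleaning step above (tokenize with TweetTokenizer, drop noise words, detokenize) can be tried in isolation. A small sketch, where sum_noise stands in for the class's noise-word list:

from nltk.tokenize import TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

sum_noise = {"и", "вот", "на"}  # hypothetical noise words
tw = TweetTokenizer()
det = TreebankWordDetokenizer()

comment = "и вот на фото спит кот"
filtered = [w for w in tw.tokenize(comment) if w not in sum_noise]
print(det.detokenize(filtered))  # -> "фото спит кот"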
Example #6
def saveTranslation(document_id):
    document = get_document(document_id)
    sentences = ""
    for i, sentence in enumerate(document.sentences):
        s = sentence.translation[:-4]
        s = s.replace("@@ ", "") .replace("'", "'") .replace(""", '"')

        from nltk import word_tokenize
        s = word_tokenize(s)

        from nltk.tokenize.treebank import TreebankWordDetokenizer
        twd = TreebankWordDetokenizer()
        s = twd.detokenize(s)
        sentences = sentences + s
        sentences = sentences + "\n"

    print(sentences)

    import datetime
    import time
    time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')

    path = EXPORT_TRANSLATIONS_FOLDER
    fileName = "translation_" + get_jwt_identity() + "_" + time + "_" + str(document_id) + ".txt"
    if not os.path.exists(path):
        os.makedirs(path)

    with open(path + "/" + fileName, 'w', encoding='utf-8') as file:
        file.write(sentences)

    return jsonify({})
Example #7
def uncased_preocess(fin, fout, keep_sep=False, max_len=512):
    tokenizer = ProphetNetTokenizer(vocab_file="prophetnet.tokenizer")
    fin = open(fin, 'r', encoding='utf-8')
    fout = open(fout, 'w', encoding='utf-8')
    twd = TreebankWordDetokenizer()
    for line in tqdm.tqdm(fin.readlines()):
        line = line.strip().replace('``', '"').replace('\'\'',
                                                       '"').replace('`', '\'')
        s_list = [
            twd.detokenize(x.strip().split(' '), convert_parentheses=True)
            for x in line.split('<S_SEP>')
        ]
        if keep_sep:
            output_string = " [X_SEP] ".join(s_list)
        else:
            output_string = " ".join(s_list)
        encoded_string = tokenizer(output_string,
                                   return_attention_mask=True,
                                   max_seq_len=max_len)
        ids, attention_mask_ids = encoded_string[
            "input_ids"][:max_len], encoded_string["attention_mask"][:max_len]
        output_string = "$1$".join([
            " ".join([str(i) for i in ids]),
            " ".join([str(i) for i in attention_mask_ids])
        ])
        fout.write('{}\n'.format(output_string))
Example #8
def clean_text(text):
    # lower
    text = text.lower()

    # Remove Punctuation
    punct = string.punctuation
    trantab = str.maketrans(punct, len(punct) * ' ')
    text = text.translate(trantab)

    # Remove Digits
    text = re.sub(r'\d+', '', text)

    # Remove Urls
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)

    # Replace emojis with unicode, Separate emojis
    text = ''.join(UNICODE_EMOJI[c][1:-1] + " " if (c in UNICODE_EMOJI) else c
                   for c in text)

    tokens = word_tokenize(text)
    porter = PorterStemmer()
    detokenizer = TreebankWordDetokenizer()

    # Remove StopWords and Stemmize
    stopwords_list = stopwords.words('english')
    whitelist = ["n't", "not", "no"]  # Not to remove
    tokens = [
        porter.stem(t) for t in tokens
        if ((t not in stopwords_list) or (t in whitelist))
    ]
    text = detokenizer.detokenize(tokens)

    return text
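
A quick usage sketch for clean_text, assuming the imports it relies on (string, re, nltk's word_tokenize, stopwords and PorterStemmer, TreebankWordDetokenizer, and the emoji UNICODE_EMOJI mapping) are in place; the exact output depends on the stopword list and stemmer:

print(clean_text("I don't like these 3 apples!!"))
# roughly -> "like appl"  (stopwords, digits and punctuation removed, remaining words stemmed)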
Example #9
def prepare_data(gens, refs):
    
    sorted_gens = sorted(gens, key=lambda k: (k['image_id'], k['round_id']))
    sorted_refs = sorted(refs, key=lambda k: (k['image_id'], k['round_id']))
    offset = 1 if (len(gens) == len(refs)) else int(len(sorted_gens)/len(sorted_refs)) 
    dt = TreebankWordDetokenizer()
    
    generations, references = [], []
    for i, refs in enumerate(sorted_refs): 
        sys.stdout.write('\r{}/{} --> {:3.1f}%'.format(str(i+1), str(len(sorted_refs)), (i+1)/float(len(sorted_refs))*100))
        sys.stdout.flush()
        if offset == 1:
            gens = sorted_gens[i]
        else:
            gens = sorted_gens[i * offset + refs['round_id'] ]

        # ensure gens and refs correspond to same image/round
        assert (gens['image_id'] == refs['image_id'])
        assert (gens['round_id'] == refs['round_id'])
       
        # list of generated answers (can be multiple generated answer per entry)
        generations.append( [dt.detokenize(word_tokenize(a_gen)) for a_gen in gens['generations'] ] )
        
        # list of references answers
        references.append( refs['refs'] )
        #references.append( [dt.detokenize(word_tokenize(a_ref)) for a_ref in refs['refs']] )
    
    sys.stdout.write('\n')  
    return generations, references
Example #10
    def _preprocess_questions(self):
        """
        Preprocesses clean words to create blanked questions
            using all clean words
        """

        self._questions = dict()

        # all possible words that can be used as answers
        clean_words = [
            word.lower() for word in self._words if self._is_clean(word)
        ]
        dt = TWD()

        for word in clean_words:
            # use lowercase for better equality check
            lower_words = [word.lower() for word in self._words]
            # don't use lower_words to preserve capitalization
            words_copy = self._words.copy()
            # put a blank in place of the word
            for index in [
                    index for index, value in enumerate(lower_words)
                    if value == word
            ]:
                words_copy[index] = "_____"

            self._questions[word] = dt.detokenize(words_copy)
Example #11
def remix(lyric):
    t = TweetTokenizer()
    d = TreebankWordDetokenizer()

    words = t.tokenize(lyric)
    r1 = random.randint(0, len(words) - 1)
    # filter out punctuation, stop words, and words with no rhymes
    while words[r1] in string.punctuation or isStopWord(words[r1]) or len(
            pronouncing.rhymes(words[r1])) == 0:
        if len(words) == 1: return lyric
        r1 = random.randint(0, len(words) - 1)

    # this is the word to be replaced
    word = words[r1]
    syl = countSyllables(word)

    # find rhymes with same number of syllables
    rhymes = pronouncing.rhymes(word)
    r2 = random.randint(0, len(rhymes) - 1)
    count = 1
    while (count <= len(rhymes)
           and syl != countSyllables(rhymes[r2])) or isStopWord(rhymes[r2]):
        r2 = random.randint(0, len(rhymes) - 1)
        count += 1
    words[r1] = rhymes[r2]

    return d.detokenize(words)
Example #12
def clean_text(text):
    # text preprocessing
    # reference https://www.kdnuggets.com/2018/11/text-preprocessing-python.html

    # convert to lower case
    text = text.lower()

    # remove non-alphanumeric
    # preserve spaces and @
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)

    # remove bounding spaces
    text = text.strip()

    # tokenize
    tokens = word_tokenize(text)

    # remove stop words
    # stemming
    stopWords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(x) for x in tokens if x not in stopWords]

    # detokenize
    detokenizer = TreebankWordDetokenizer()
    text = detokenizer.detokenize(tokens)

    return text
Example #13
def preocess(fin, fout, keep_sep=False, max_len=512):
    """Preprocess the raw text file by using bert tokenizer and generate the bpe
       files

    Args:
        fin (str): input raw text file path
        fout (str): output bpe file path
        keep_sep (bool, optional): indicates if the output strings will be
            joined with [X_SEP]. Defaults to False.
        max_len (int, optional): max input sentence length. Defaults to 512.
    """
    fin = open(fin, 'r', encoding='utf-8')
    fout = open(fout, 'w', encoding='utf-8')
    twd = TreebankWordDetokenizer()
    bpe = BertTokenizer.from_pretrained('bert-base-uncased')
    for line in tqdm.tqdm(fin.readlines()):
        line = line.strip().replace('``', '"').replace('\'\'', '"').replace(
            '`', '\'')
        s_list = [twd.detokenize(x.strip().split(
            ' '), convert_parentheses=True) for x in line.split('<S_SEP>')]
        tk_list = [bpe.tokenize(s) for s in s_list]
        output_string_list = [" ".join(s) for s in tk_list]
        if keep_sep:
            output_string = " [X_SEP] ".join(output_string_list)
        else:
            output_string = " ".join(output_string_list)
        output_string = " ".join(output_string.split(' ')[:max_len-1])
        fout.write('{}\n'.format(output_string))
Example #14
def spacy_entity_extraction(content):
    try:
        from nltk import word_tokenize
        import spacy
        nlp = spacy.load('en_core_web_md')
        capitalized_text = []
        tokenized_words = word_tokenize(content)
        for text in tokenized_words:
            capitalize_first_char = text.capitalize()
            capitalized_text.append(capitalize_first_char)
        detokenizer = Detok()
        detokenized_text = detokenizer.detokenize(capitalized_text)
        #remove_cardinal = re.sub(r'[0-9]+', '', detokenized_text)
        nlp_document = nlp(detokenized_text)
        str_replace_dict = {}
        if len(nlp_document.ents) == 0:
            str2 = detokenized_text
        else:
            for entities in nlp_document.ents:
                extracted_entities = {entities.label_}
                if 'CARDINAL' not in extracted_entities:
                    extracted_text = {entities.text}
                    #print(extracted_text)
                    #print(extracted_text)
                    for key in extracted_text:
                        str_replace_dict[
                            key] = "<span class='imp'>" + key + '</span>'
            str2 = multiwordReplace(detokenized_text, str_replace_dict)
        return str2
    except Exception as e:
        error_updation.exception_log(e, "Error in entities_extraction :",
                                     str(''))
Example #15
def find_verb_fillers(sentences, verb, nlp_pipeline, valence=2):
    """extract all verb fillers occurences for a given valence.

    Args:
        sentences ([list of list of str]): list of tokenized sentences.
        verb (str): target verb
        nlp_pipeline (spacy.nlp): a spacy nlp pipeline instance
        valence (int, optional): number of verb arguments. Defaults to 2.

    Returns:
        list of tuples (sentence, HanksVerb)
    """

    fillers = []

    # this detokenization step is necessary because the spaCy pipeline takes a string as input
    detokenizer = TreebankWordDetokenizer()
    joined_sents = [detokenizer.detokenize(sent) for sent in sentences]

    for sentence, joined_sent in zip(sentences, joined_sents):
        doc = nlp_pipeline(joined_sent)

        # find target verb occurrences in the sentence
        target_verbs = find_target_verbs(doc, verb)

        # loop through target verb occurrences
        for target_verb in target_verbs:

            # retrieve verb fillers from syntactic dependencies
            hank_verb = get_hanks_verb(target_verb)
            # check for valence
            if hank_verb.nargs == valence:
                fillers.append((sentence, hank_verb))
    return fillers
Example #16
def untokenizer(sentence):
    """ Takes the non-annotated sentences and convert them to string"""
    if isinstance(sentence, list):
        detkn = TreebankWordDetokenizer()
        sentence = detkn.detokenize(sentence)
        return sentence
    else:
        return sentence
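
A short usage note for untokenizer (output shown is approximate):

print(untokenizer(["It", "is", "n't", "raining", "."]))  # -> "It isn't raining."
print(untokenizer("already a plain string"))             # returned unchanged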
Example #17
def detokenize(lyric):
    detokenizer = Detok()
    detoken_list = []
    for list_item in lyric:
        text = detokenizer.detokenize(list_item)
        detoken_list.append(text)
    return detoken_list
Example #18
class Vocab:
    def __init__(self, config):
        self.idx2word = {config.pad_idx: "<pad>", config.bos_idx: "<bos>", config.eos_idx: "<eos>", config.unk_idx: "<unk>"}
        self.word2idx = {}
        for k, v in self.idx2word.items():
            self.word2idx[v] = k
        self.vocab_size = len(self.idx2word)
        self.ontology_size = self.vocab_size
        self.detokenizer = TreebankWordDetokenizer()

    def add(self, sentence, word=False):
        if word:
            token = sentence
            if token not in self.word2idx:
                self.word2idx[token] = self.vocab_size
                self.idx2word[self.vocab_size] = token
                self.vocab_size += 1
        else:
            sequence = sentence.lower().split()
            for token in sequence:
                if token not in self.word2idx:
                    self.word2idx[token] = self.vocab_size
                    self.idx2word[self.vocab_size] = token
                    self.vocab_size += 1

    def encode(self, sentence, word=False):
        if word:
            token = sentence.lower()
            encoded = [self.word2idx["<bos>"]]
            encoded.append(self.word2idx.get(token, self.word2idx["<unk>"]))
            encoded.append(self.word2idx["<eos>"])
        else:
            sequence = sentence.lower().split()
            encoded = [self.word2idx["<bos>"]]
            for token in sequence:
                encoded.append(self.word2idx.get(token, self.word2idx["<unk>"]))
            encoded.append(self.word2idx["<eos>"])

        return encoded

    def decode(self, sequence):
        decoded = []
        for token in sequence:
            decoded.append(self.idx2word[token])
        decoded = self.detokenizer.detokenize(decoded)
        decoded = re.sub(r"([a-zA-Z0-9]) ([\.\,\!\?])", r"\1\2", decoded)
        return decoded

    def load(self, save_path):
        idx2word = json.load(open(os.path.join(save_path, "idx2word.json"), "r"))
        word2idx = json.load(open(os.path.join(save_path, "word2idx.json"), "r"))
        self.word2idx.update(word2idx)
        for k, v in idx2word.items():
            self.idx2word[int(k)] = v
        self.vocab_size = len(self.word2idx)

        vocab_config = json.load(open(os.path.join(save_path, "vocab_config.json"), "r"))
        self.ontology_size = vocab_config["ontology_size"]
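
A minimal usage sketch for the Vocab class, assuming its own imports (re, json, os, TreebankWordDetokenizer) are present; the config object here is hypothetical (anything exposing pad_idx, bos_idx, eos_idx and unk_idx works), and the exact ids depend on insertion order:

from types import SimpleNamespace

config = SimpleNamespace(pad_idx=0, bos_idx=1, eos_idx=2, unk_idx=3)
vocab = Vocab(config)
vocab.add("the cat sat on the mat")

ids = vocab.encode("the cat sat")   # e.g. [1, 4, 5, 6, 2] with <bos>/<eos> wrapping
print(vocab.decode(ids[1:-1]))      # -> "the cat sat"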
Example #19
class TreebankEncoder(StaticTokenizerEncoder):
    """ Encodes the text using the Treebank tokenizer.

    Tokenization Algorithm Reference:
    http://www.nltk.org/_modules/nltk/tokenize/treebank.html

    Args:
        sample (list of strings): Sample of data to build dictionary on
        min_occurrences (int, optional): Minimum number of occurrences for a token to be added to
          dictionary.
        append_eos (bool, optional): If `True` append EOS token onto the end to the encoded vector.

    Example:

        >>> encoder = TreebankEncoder(["This ain't funny.", "Don't?"])
        >>> encoder.encode("This ain't funny.")
         5
         6
         7
         8
         9
        [torch.LongTensor of size 5]
        >>> encoder.vocab
        ['<pad>', '<unk>', '</s>', '<s>', '<copy>', 'This', 'ai', "n't", 'funny', '.', 'Do', '?']
        >>> encoder.decode(encoder.encode("This ain't funny."))
        "This ain't funny."

    """
    def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError(
                'TreebankEncoder defines a tokenize callable TreebankWordTokenizer'
            )

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise

        self.detokenizer = TreebankWordDetokenizer()

        super().__init__(*args,
                         **kwargs,
                         tokenize=TreebankWordTokenizer().tokenize)

    def decode(self, tensor):
        tokens = [self.itos[index] for index in tensor]
        return self.detokenizer.detokenize(tokens)
Example #20
def sneak():
    twd = TreebankWordDetokenizer()
    print(brown.sents(categories=['adventure', 'mystery']))
    sent = brown.sents(categories=['adventure', 'mystery'])
    for s in sent:
        load_sentence(twd.detokenize(s))


# sneak()
Example #21
def detok(input):
    tokens = input.split()
    detokenizer = Detok()
    text = detokenizer.detokenize(tokens)
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '. ', text)
    text = re.sub(r'\s*\?\s*', '? ', text)
    text = text.strip()

    return text
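
Quick usage sketch (Detok is this example's alias for TreebankWordDetokenizer, and re must be imported; output is approximate):

print(detok("Hello , world . How are you ?"))  # -> "Hello, world. How are you?"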
Example #22
    def docx2csv(directory, label='yes', name="DOCX.csv"):
        from docx import Document
        home = os.getcwd()
        os.chdir(directory)
        data = []
        for filename in os.listdir(os.getcwd()):
            try:
                _, file_extension = os.path.splitext(filename)  # don't shadow the 'name' argument used for the output CSV
                if '.docx' in file_extension or '.DOCX' in file_extension:
                    data.append(filename)
            except ValueError:
                continue

        #READ each document
        book = []
        for file in data:
            document = Document(file)
            lines = []

            #convert DOCX to text
            for p in document.paragraphs:
                text = p.text
                #text = text.encode('utf-8')
                lines.append(text)
            entries = []

            #detokenize
            d = TreebankWordDetokenizer()
            bodytext = d.detokenize(lines)
            bodytext = str(bodytext)

            #sever the unwanted head and tail
            bodytext, sep, tail = bodytext.partition(
                "Classification   Language:")
            head, sep, bodytext = bodytext.partition("Body ")

            #add label
            entries.append(label)

            #clean encodings
            #bodytext = bodytext.decode("utf8")
            bodytext = bodytext.replace("\\xc2\\xa0", "")
            bodytext = bodytext.replace("\\xc2", "")
            entries.append(bodytext.encode("utf-8"))
            #print(entries)

            #append to the book
            book.append(entries)

        os.chdir(home)
        #write to CSV file
        with open("preprocess/" + name, "w") as data:
            writer = csv.writer(data)
            writer.writerows(book)
Example #23
class Tokenizer:
    def __init__(self):
        self._slot_delimiters = ("{", "}")
        self._regex_slot = re.compile(
            r"{}+\w+{}".format(*self._slot_delimiters))
        self._wp_tokenizer = WordPunctTokenizer()
        self._detokenizer = TreebankWordDetokenizer()
        self._slot_placeholder = "THISISASLOT_"

    def tokenize_sequence(self, string_sequence: str) -> List[Token]:
        def check_slots(text: str) -> bool:
            if self._regex_slot.search(text):
                return True
            return False

        string_tmpl_processed = string_sequence
        slot_counter = 0
        slot_placeholders = {}
        # find all slots and replace with placeholders
        while check_slots(string_tmpl_processed):
            placeholder = self._slot_placeholder + str(slot_counter)
            slot_key = self._regex_slot.findall(string_tmpl_processed)[0]
            string_tmpl_processed = self._regex_slot.sub(
                placeholder, string_tmpl_processed, 1)
            slot_placeholders[placeholder] = slot_key
            slot_counter += 1
        logger.debug("processed string: {}".format(string_tmpl_processed))
        logger.debug("slots found: {}".format(slot_placeholders))
        # tokenize the processed string
        string_tmpl_tokenized = self._wp_tokenizer.tokenize(
            string_tmpl_processed)
        logger.debug("tokenized sequence: {}".format(string_tmpl_tokenized))
        out = []
        for t in string_tmpl_tokenized:
            token_obj = Token()
            if t.startswith(self._slot_placeholder):
                logger.debug("slot token object {}".format(t))
                token_obj.text = slot_placeholders[t][1:-1]
                token_obj.is_placeholder = True
            else:
                logger.debug("text token object {}".format(t))
                token_obj.text = t
                token_obj.is_placeholder = False
            out.append(token_obj)

        return out

    def detokenize(self, source: List[Token]) -> str:
        text_tokens = [
            (("{" + token.text + "}") if token.is_placeholder else token.text)
            for token in source
        ]
        return self._detokenizer.detokenize(text_tokens)
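
A hedged usage sketch for this Tokenizer; it assumes the Token class (with text and is_placeholder attributes) and the module-level logger, which are not shown here:

tok = Tokenizer()
tokens = tok.tokenize_sequence("Set an alarm for {time}, please.")
print([t.text for t in tokens])
# -> ['Set', 'an', 'alarm', 'for', 'time', ',', 'please', '.']
print(tok.detokenize(tokens))
# -> roughly "Set an alarm for {time}, please."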
Example #24
def detokenize(line):
    """
    Detokenizes the processed CNN/DM dataset to recover the original dataset,
    e.g. converts "-LRB-" back to "(" and "-RRB-" back to ")".
    """
    line = line.strip().replace("``", '"').replace("''", '"').replace("`", "'")
    twd = TreebankWordDetokenizer()
    s_list = [
        twd.detokenize(x.strip().split(" "), convert_parentheses=True)
        for x in line.split("<S_SEP>")
    ]
    return " ".join(s_list)
Example #25
    def _prepare_for_poet(self):
        """ Tokenize, get rid of punctuation, lowercase-ify """
        TOKENIZE_PATTERN = re.compile(r"[a-zA-Z]+-?'?[a-zA-Z]*")

        text = self.context
        if isinstance(self.context, nltk.Text):
            # detokenize if nltk text passed in
            twd = TreebankWordDetokenizer()
            text = twd.detokenize(self.context)

        self.tokens = [w.lower() for w in TOKENIZE_PATTERN.findall(text)]
        self.text = nltk.Text(self.tokens)
Example #26
def remove_NLTK_stop1(all_data):
    sw = stopwords.words('english')
    deto = Detok()

    all_cleaned = list()

    for article in all_data:
        word_tokens = word_tokenize(article)
        all_cleaned.append(
            deto.detokenize([w for w in word_tokens if not w in sw]))

    return all_cleaned
Example #27
def load_brown_corpus() -> List[str]:
    nltk.download('brown')
    sentences = list(
        filter(
            lambda sent: (len(sent) <= 30) and
            (len(sent) >= 3) and any(map(lambda word: word.isalpha(), sent)),
            brown.sents()))
    mdetok = TreebankWordDetokenizer()
    return list(
        map(
            lambda sent: mdetok.detokenize(
                (' '.join(sent).replace('``', '"').replace("''", '"').replace(
                    '`', "'")).split()), sentences))
Example #28
def textify(tokens):
    """
    De-tokenizes a list of tokens back into normal text.

    Args:
        tokens: List of tokens

    Returns:
        String of text
    """
    d = TreebankWordDetokenizer()
    tokens = filter(lambda x: x != "", map(revert_ctrl_token, tokens))
    return d.detokenize(tokens).replace("\n ", "\n")
Example #29
def getSubSentenceList(sentence1, sentence2, set1, set2):
    # obtain the diff words
    (set1, set2) = wordDiffSet(sentence1, sentence2)

    # generate sub sentences
    subsentL1 = []
    subsentL2 = []

    removeIdx1 = []
    removeIdx2 = []

    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()

    sentence1L = tokenizer.tokenize(sentence1)
    sentence2L = tokenizer.tokenize(sentence2)

    for idx, word in enumerate(sentence1L):
        if word in set1:
            removeIdx1.append(idx)

    for idx, word in enumerate(sentence2L):
        if word in set2:
            removeIdx2.append(idx)

    for idx in removeIdx1:
        tokens = tokenizer.tokenize(sentence1)
        tokens.pop(idx)
        subsent = detokenizer.detokenize(tokens)
        subsentL1.append(subsent)

    for idx in removeIdx2:
        tokens = tokenizer.tokenize(sentence2)
        tokens.pop(idx)
        subsent = detokenizer.detokenize(tokens)
        subsentL2.append(subsent)

    return (subsentL1, subsentL2)
Example #30
def text_random_crop(text, crop_by: str = 'word', crop_ratio: float = 0.1):
    if crop_by == 'word':
        seq = nltk.word_tokenize(text)
    elif crop_by == 'sentence':
        seq = nltk.sent_tokenize(text)
    else:  # char
        seq = text
    size = len(seq)
    chop_size = size // (1 / crop_ratio)
    chop_offset = random.randint(0, int(chop_size))
    cropped = seq[chop_offset:size - chop_offset - 1]

    d = TreebankWordDetokenizer()
    return d.detokenize(cropped)
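
A small usage sketch for text_random_crop, assuming nltk, random and the detokenizer import used above; the result varies because the crop offset is random:

text = "The quick brown fox jumps over the lazy dog near the quiet river bank today"
print(text_random_crop(text, crop_by='word', crop_ratio=0.1))
# trims a random number of tokens from the start and end of the text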
Example #31
#from six.moves import xrange
import os, sys
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer # nltk 3.3
from truecaser.Truecaser import *
import _pickle as cPickle

def convert(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    #tokens = nltk.word_tokenize(sentence)
    return getTrueCase(tokens, 'as-is', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# load a truecase model
print("load truecaser", file=sys.stderr)
#curr_dir = os.path.dirname(__file__)
f = open("truecaser/distributions.obj", 'rb')
uniDist = cPickle.load(f)
backwardBiDist = cPickle.load(f)
forwardBiDist = cPickle.load(f)
trigramDist = cPickle.load(f)
wordCasingLookup = cPickle.load(f)
f.close()

if __name__ == "__main__":
    sent = "I do n't have cats named Tom and Jerry ."
    tokens = [x.lower() for x in sent.split()]
    print(tokens)
    truecase_tokens = convert(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
    detokenizer = TreebankWordDetokenizer()
    sent = detokenizer.detokenize(truecase_tokens)
    print(sent)