Example #1
def find_first_n_keywords(df, header, n, char_list):
    stop_words = set(stopwords.words('english'))
    column = df[header]
    first_n_list = []

    for text_row in column:
        word_tokens = get_unique_word(text_row)
        # word_tokens = word_tokenize(text_row)
        filtered = [w for w in word_tokens if w not in stop_words]
        filtered_char = [w for w in filtered if w not in char_list]
        filtered_space = [w for w in filtered_char if w != '']
        first_n_row = filtered_space[:n]
        first_n_row = TreebankWordDetokenizer().detokenize(first_n_row)
        first_n_list.append(first_n_row)
    df['first_' + str(n) + '_keywords_in_' + header] = first_n_list
    return df
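A minimal usage sketch for the example above (added for illustration, not part of the original source). It assumes the NLTK stopwords corpus is available and substitutes a simple stand-in for the project's get_unique_word helper:

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

def get_unique_word(text):
    # stand-in for the project's helper: lowercase tokens, first occurrence only
    seen, unique = set(), []
    for tok in word_tokenize(text.lower()):
        if tok not in seen:
            seen.add(tok)
            unique.append(tok)
    return unique

df = pd.DataFrame({'title': ['The quick brown fox jumps over the lazy dog']})
df = find_first_n_keywords(df, 'title', 3, char_list=[',', '.', '!'])
print(df['first_3_keywords_in_title'].iloc[0])  # e.g. "quick brown fox"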
Example #2
    def preprocess_data(self):
        """
        clean sentences and normalize them, transform sentences to sequence of integers
        """
        # eliminate stop words and punctuation from data
        words, classes, documents = [], [], []
        # loop through each sentence in our intents patterns
        for intent in self.data['intents']:
            if 'patterns' in intent:
                for pattern in intent['patterns']:
                    # tokenize each word in the sentence
                    pattern = pattern.lower()
                    tokens = WordPunctTokenizer().tokenize(pattern)
                    filtered_words = [
                        w for w in tokens if w not in self.stop_words
                    ]
                    words.extend(filtered_words)
                    sentence = TreebankWordDetokenizer().detokenize(
                        filtered_words)
                    # add to documents in our corpus
                    documents.append((sentence, intent['tag']))
                    # add to our classes list
                    if intent['tag'] not in classes:
                        classes.append(intent['tag'])

        # save dictionary of words (keep alphabetic tokens only; avoid
        # removing items from the list while iterating over it)
        repository = [w for w in words if w.isalpha()]

        # separate sentences (input) and labels (output)
        text, output = [], []
        for doc in documents:
            text.append(doc[0])
            output.append(classes.index(doc[1]))

        self.input = text  # save input
        self.output = np.asarray(output)  # save output
        self.classes = classes  # save target classes
        self.words = Counter(repository)  # save words dictionary

        self.transform_text_to_numeric()  # transform sentences to sequences of integers
        self.transform_numeric_with_embeddings()  # create embedding matrix with words in dictionary
Example #3
def change_genderwords(text):
    newtext = ""
    for sentence in tokenize.sent_tokenize(text):
        tokens = tokenize.word_tokenize(sentence)
        for count, token in enumerate(tokens):
            for words in gender_word:
                if words[0] == token:
                    tokens[count] = words[1]
                elif words[1] == token:
                    tokens[count] = words[0]
                elif words[0] + "'s" == token:
                    tokens[count] = words[1] + "'s"
                elif words[1] + "'s" == token:
                    tokens[count] = words[0] + "'s"
        detoken = TreebankWordDetokenizer().detokenize(tokens)
        newtext = newtext + detoken + ' '
    return newtext
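A short usage sketch (illustrative only). The function relies on a module-level gender_word list of word pairs, which the original snippet does not show, so a hypothetical one is defined here; note that the lookup is case-sensitive:

from nltk import tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

gender_word = [('he', 'she'), ('him', 'her'), ('his', 'her'), ('man', 'woman')]  # hypothetical pairs

print(change_genderwords("he gave his book to the woman. she thanked him."))
# roughly: "she gave her book to the man. he thanked her."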
Example #4
def average_sentlength( tokens ):
    """
    Calculates the sentence word length.
    len(nltk.word_tokenize(sent)) -> length of each sentence, tokenized individually
    (sum(len(nltk.word_tokenize(sent)) for sent in sent_tokens)) -> sum of all sentence lengths
    :param tokens:  the tokenized list of words
    :return:        average sentence length
    """
    #detokenize the tokenized list --> rebuilding the sentences
    original_text = TreebankWordDetokenizer().detokenize( tokens )
    #use the sentence tokenizer
    sent_tokens = nltk.sent_tokenize( original_text )

    if len(sent_tokens) != 0:
        return ( sum(len(nltk.word_tokenize( sent )) for sent in sent_tokens) ) / len(sent_tokens)
    else: 
        return 0
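A quick usage sketch (added for illustration); it only requires NLTK's punkt tokenizer data:

import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer

tokens = nltk.word_tokenize("This is one sentence. Here is another, slightly longer one.")
print(average_sentlength(tokens))  # 6.5 here: the two rebuilt sentences have 5 and 8 tokens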
Example #5
    def create_subtitle_features_df(self, subtitle_dir):
        """
        Extract features (see engineer_features()) from subtitles for a specific movie.

        :param subtitle_dir: subtitle corpus directory (the movie is selected via self.imdb_id)
        :returns: Pandas dataframe with shape (n_words, n_features)
        """
        features_df = pd.DataFrame()
        files = os.listdir(subtitle_dir)
        list_of_text_files = []
        for file in files:
            if re.search(self.imdb_id, file):
                list_of_text_files.append(file)
        sent_n = 0
        sent_per_episode = int(50 / len(list_of_text_files))
        for episode, file in enumerate(list_of_text_files):
            filename = subtitle_dir / file
            with open(filename, 'r') as subtitles:
                texts = subtitles.read()
            sents_all = SENT_TOKENIZER.tokenize(texts)
            window_size = 3
            # the last two sentences are noise, so they are dropped
            if isinstance(sents_all[:-2], list) and len(sents_all[:-2]) >= sent_per_episode:
                sents = np.random.choice(sents_all[:-2], sent_per_episode)
                for itext in range(0, len(sents), window_size):
                    text_window = sents[itext:(itext + window_size)]
                    text_window_raw = TreebankWordDetokenizer().detokenize(text_window)
                    arguments = (text_window_raw, '_',
                                 'episode' + str(episode) + '_tw' + str(itext), 'movie')
                    rt = SingleTextProcessor(*arguments)
                    if len(rt.sentences) > 2:
                        rt.process_self()
                        feature_dict = rt.to_dict()
                        features = engineer_features(feature_dict)
                        features_df = features_df.append(features,
                                                         ignore_index=True)
                        sent_n += window_size
                        print('----------\n\n------ PROCESSING SENTENCE',
                              sent_n, 'in episode', episode + 1, 'out of',
                              len(list_of_text_files),
                              '------\n\n----------')
        self.subtitle_features = features_df
Example #6
def SpellCheck(data):
    Spell_Words = []
    spell = SpellChecker()
    # teach the checker a few extra tokens so they are not "corrected"
    spell.word_frequency.load_words(['molded', '.', '(', ')'])
    for i in spell.split_words(data):
        w = Word(i)  # Word comes from textblob
        corrected = spell.correction(w)
        if corrected != w:
            corrected = colored(corrected, 'blue')
        Spell_Words.append(corrected)

    Corrected_Words = TreebankWordDetokenizer().detokenize(Spell_Words)
    return Corrected_Words
Example #7
def SpellCheck2(data):
    spell = SpellChecker()
    Spell_Words = []
    # teach the checker a few extra tokens so they are not "corrected"
    spell.word_frequency.load_words(['molded', '.', '(', ')'])
    # Note that word_tokenize does not necessarily deal with punctuation unless
    # you provide a custom tokenizer
    words_split = nltk.word_tokenize(data)
    # misspelled = spell.unknown(words_split)
    for word in words_split:
        correction = spell.correction(word)
        if correction is None:  # newer pyspellchecker versions return None for unknown words
            correction = word
        if correction != word:
            correction = colored(correction, 'blue')
        Spell_Words.append(correction)

    Corrected_Words = TreebankWordDetokenizer().detokenize(Spell_Words)
    return Corrected_Words
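A usage sketch for the two spell-check helpers above (illustrative only). It assumes pyspellchecker, textblob, and termcolor are installed; the actual corrections depend on the checker's dictionary:

import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from spellchecker import SpellChecker
from textblob import Word
from termcolor import colored

print(SpellCheck2("Ths sentense has a fw speling mistakes."))
# misspelled tokens are replaced by the checker's best guess and highlighted in blue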
Example #8
def preprocess(df, full_processing=False):
    # Tokenize review text
    df['reviewText'] = df['reviewText'].apply(word_tokenize)

    # Remove noise
    df['reviewText'] = df['reviewText'].apply(remove_noise)

    # Perform full processing, if needed
    if full_processing:
        df['reviewText'] = df['reviewText'].apply(filter_stopwords)
        df['reviewText'] = df['reviewText'].apply(lemmatize)

    # Detokenize text
    df['reviewText'] = df['reviewText'].apply(
        TreebankWordDetokenizer().detokenize)

    return df.to_numpy()
Example #9
def search_news():
    """
    This endpoint is a (POST) HTTP method, it takes a json that consists
    of a key value pair - keywords that holds an array of non empty
    keyword of type string. sample is shown below
    {
        "keywords": [ "one", "two", "three" ]
    }
    """

    query_input = request.get_json()
    if not query_input or 'keywords' not in query_input:
        return response.bad_request(message='not a valid input')

    array_input = query_input['keywords']

    if not isinstance(array_input, list):
        return response.bad_request(message='keywords must be an array')

    if not array_input or array_input[0].strip() == '':
        return response.bad_request(
            message='array must contain a non-empty string')

    detokenized_words = TreebankWordDetokenizer().detokenize(array_input)
    identifier = '_'.join(array_input).lower()
    news_holder = []
    query = {"identifier": identifier}
    not_included = {"_id": 0, "identifier": 0}
    try:
        news_from_db = mongo.db.news.find(query, not_included)
        if news_from_db:
            for obj in news_from_db:
                news_holder.append(obj)
            if len(news_holder) > 0:
                news_holder.sort(key=constant.get_my_key, reverse=True)
                print('!!!!!!! got data from db !!!!!!!')
                return response.success(data=news_holder)
    except Exception as err:
        print(err)
        return response.internal_server_error(
            message=f'error occurred while querying from db - {err}')
    print('!!!!!!!! got data from internet !!!!!!!!!!!')
    return get_news_from_internet(detokenized_words, identifier, array_input)
Example #10
def punctuation(para):
    fin = []
    #words = nltk.tokenize.word_tokenize(para)
    words = re.split('[ ]', para)
    cap_sugg = {}
    for ind in range(len(words)):
        word = words[ind]
        flag = 0
        ch = ''
        if (word == ""):
            continue
        if (word[-1] == '.' or word[-1] == '?' or word[-1] == '!'):
            flag = 1
            ch = word[-1]
            word = word[:-1]
        sug = word
        if (ut.tag([word])[0][1] is None):
            word1 = word[0].upper() + word[1:]
            cap_sugg[ind] = word1
            sug = word1
        sug = sug + ch
        fin.append(sug)
    para = TreebankWordDetokenizer().detokenize(fin)
    print(para)
    arr = re.split('[?.!]', para)
    # print(arr)
    newpara = []
    for line in arr:
        line = line.lstrip()
        line = line.rstrip()
        if (line == ''):
            continue
        if (is_question(line)):
            newline = line[0].upper() + line[1:len(line)]
            newpara.append(newline + '?')
        else:
            newline = line[0].upper() + line[1:len(line)]
            newpara.append(newline + '.')
    if newpara and newpara[-1] == '.':
        newpara = newpara[:-1]
    result = ' '.join(newpara)
    return result
Example #11
def abstractive_summary():
    os.system("python make_datafiles.py")
    os.system("python run_summarization.py --mode=decode --data_path=finished_files/test.bin --vocab_path=vocab --log_root=logs --exp_name=myexperiment")

    with open('logs/myexperiment/decode/attn_vis_data.json') as json_file:
        data = json.load(json_file)
        text = (TreebankWordDetokenizer().detokenize(data["decoded_lst"]))
        
        text = text.replace(" . ", ". ").replace(" , ", ", ").replace(" ; ", "; ").replace(" \' s", "\'s").replace(" “ ", " “").replace(" ” ", "” ")

        sentences = sent_tokenize(text)
        sentences = [s.capitalize().strip() for s in sentences]

        sentences = list(set(sentences))

        text = " ".join(sentences).strip()
       
        return text
Example #12
    def get_function(self):
        dtk = TreebankWordDetokenizer()

        def lsa(array):
            array = pd.Series(array, index=pd.Series(array.index), name='array')
            copy = array.dropna()
            copy = copy.apply(lambda x: dtk.detokenize(clean_tokens(x)))
            li = self.trainer.transform(copy)
            lsa1 = pd.Series(li[:, 0], index=copy.index)
            lsa2 = pd.Series(li[:, 1], index=copy.index)
            array = pd.DataFrame(array)
            array['l1'] = lsa1
            array['l2'] = lsa2

            arr = ((np.array(array[['l1', 'l2']])).T).tolist()
            return pd.Series(arr)

        return lsa
Example #13
def extra_text_import():

    news_text = brown.sents(categories="news")
    words = []
    for sent in news_text:
        words.append(TreebankWordDetokenizer().detokenize(sent))

    remove = string.punctuation + string.digits
    remove = remove.replace(",", "")
    remove = remove.replace(".", "")
    # print("patterns to remove", remove)
    table = str.maketrans("", "", remove)

    words = [w.lower() for w in words]
    words = [w.translate(table) for w in words]

    return words #[0:1]
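A usage sketch (illustrative only; requires the NLTK brown corpus to be downloaded):

import string
from nltk.corpus import brown
from nltk.tokenize.treebank import TreebankWordDetokenizer

sents = extra_text_import()
print(sents[0])  # first news sentence, lowercased, with most punctuation stripped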
Example #14
def extractgender(redacted_data):
    redacted_gender_files = []
    countgender = 0
    gender = ['mr.', 'sir', 'his', 'mister', 'mr', 'prince', 'king', 'mrs.',
              'ms.', 'miss', 'her', 'lady', 'madameoiselle', 'baroness',
              'mistress', 'mrs', 'ms', 'queen', 'princess', 'madam', 'madame']
    for i in range(len(redacted_data)):
        tokens = word_tokenize(redacted_data[i])
        for n, tok in enumerate(tokens):
            for j in range(len(gender)):
                if tok.lower() == gender[j]:
                    tokens[n] = '██'
                    countgender += 1
        file = TreebankWordDetokenizer().detokenize(tokens)
        redacted_gender_files.append(file)

    return redacted_gender_files, countgender
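A brief usage sketch (illustrative, not from the original project), assuming NLTK's punkt tokenizer data is installed:

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

docs = ["Mr. Smith called her yesterday.", "The queen met the king."]
redacted, count = extractgender(docs)
print(redacted)  # gendered terms replaced by the block character
print(count)     # number of redactions made (here "Mr.", "her", "queen", "king")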
Example #15
def reinstate_abbreviation_expansion(sentence, abbrevs):
    # abbrevs = {key + "_1": value for key, value in abbrevs.items()}
    tokens_text = word_tokenize(sentence)
    abbrev_data = find_all_abbreviations(sentence, abbrevs)
    if len(abbrev_data) == 0:
        return sentence
    abbrev_data.sort(key=lambda x: x.index)

    result_tokens = []
    start = 0
    for abbrev in abbrev_data:
        start_tokens = tokens_text[start:abbrev.index]
        start_tokens.append(abbrev.construct_abbrev_with_expansion())
        result_tokens.extend(start_tokens)
        start = abbrev.index + 1
    result_tokens.extend(tokens_text[start:])

    return TreebankWordDetokenizer().detokenize(result_tokens)
Example #16
def prepn(inp):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(inp)

    # keep only the tokens that are not stop words
    filtered_sentence = [w for w in word_tokens if w not in stop_words]

    return TreebankWordDetokenizer().detokenize(filtered_sentence)
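A one-line usage sketch (added for illustration; requires the NLTK stopwords corpus):

print(prepn("This is a sample sentence, showing off the stop words filtration."))
# roughly: "This sample sentence, showing stop words filtration."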
Example #17
def index(request):

    if 'inp' not in request.POST:
        sentence = ""
    else:
        sentence = request.POST['inp']

    if 'spells' not in request.POST:
        spellid = '-1'
    else:
        spellid = request.POST['spells']

    ignore = request.session.get('ignore', [])

    if 'ignoreid' in request.POST:
        ignore.append(request.POST['ignoreid'])
        spellid = request.POST['ignoreid']
        request.session['ignore'] = ignore

    tokens = nltk.word_tokenize(sentence)

    if 'change' in request.POST:
        logging.debug(request.POST['change'])
        spellid = request.POST['changeid']
        tokens[int(spellid)] = request.POST['change']
        sentence = TreebankWordDetokenizer().detokenize(tokens)

    spells = spell_suggestions(tokens)

    for i in ignore:
        spells[int(i)] = []

    if int(spellid) not in spells:
        spells = []
    else:
        spells = spells[int(spellid)]

    return render(
        request, 'rewrite_app/index.html', {
            'sentence': sentence,
            'spells': spells,
            'tokens': tokens,
            'spellid': spellid
        })
Example #18
def pre_processing(filename='data.csv'):
    print('loading and tokenizing data ....')
    train_data = pd.read_csv(filename, index_col=0)
    print(train_data.head())
    print(train_data.describe())
    print(train_data.info())
    nltk.download('punkt')
    nltk.download('stopwords')
    en_stopwords = stopwords.words('english')
    detokenizer = TreebankWordDetokenizer()

    def clean_description(desc):
        desc = word_tokenize(desc.lower())
        desc = [
            token for token in desc
            if token not in en_stopwords and token.isalpha()
        ]
        return detokenizer.detokenize(desc)

    train_data['review'] = train_data['features'].apply(clean_description)
    train_data.dropna(inplace=True)

    # target_1_values = set(df['continuous_target_1'])
    # >>> target_1_values
    # {80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100}
    def points_to_class(points):
        if points in range(80, 83):
            return 0
        elif points in range(83, 87):
            return 1
        elif points in range(87, 90):
            return 2
        elif points in range(90, 94):
            return 3
        elif points in range(94, 98):
            return 4
        else:
            return 5

    train_data['label'] = train_data['continuous_target_1'].apply(
        points_to_class)

    return train_data[["review", "label"]]
Example #19
def sampleCharacter(text, char, n, l):
    indices = []
    for i in range(len(text)):
        if text[i] == char:
            indices.append(i)
    master = {}
    while n > 0 and len(indices) > 0:
        x = indices[random.randint(0, len(indices) - 1)]
        start = int(x - l / 2)
        if start < 0:
            start = 0
        new = []
        for i in range(l):
            if i + start < len(text):
                new.append(text[i + start])
        master.update({start: TreebankWordDetokenizer().detokenize(new)})
        n = n - 1
        indices.remove(x)
    return master
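A usage sketch (illustrative only); sampleCharacter expects an already-tokenized list and returns a dict mapping a window's start index to its detokenized text:

import random
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

tokens = word_tokenize("the cat sat on the mat while the dog slept by the door")
samples = sampleCharacter(tokens, 'the', n=2, l=6)
print(samples)  # {window_start_index: detokenized six-token window, ...}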
Example #20
def supercase(text):
    shift = "⇧"
    caps = "⇪"
    tokens = word_tokenize(text)
    mod_tokens = []

    for t in tokens:
        #Check if entire word is at least two characters and is entirely uppercased
        if len(t) >= 2 and t == t.upper() and t[0].isalpha():
            mod_tokens.append(t.lower() + caps)
        #Check if upper
        elif t[0].isupper():
            mod_tokens.append(t.lower() + shift)
        #If none of this is true, return original
        else:
            mod_tokens.append(t.lower())

    supercasedtext = TreebankWordDetokenizer().detokenize(mod_tokens)
    return supercasedtext
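A tiny usage sketch (illustrative): fully uppercased words get the caps marker, capitalized words get the shift marker:

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

print(supercase("The NASA rover Landed safely"))
# roughly: "the⇧ nasa⇪ rover landed⇧ safely"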
Example #21
    def get_sample_text_passages(self, expression, no_passages):
        """
        Returns a specified number of example passages that include a certain expression.

        The number of passages that you request is a maximum number, and this function may return
        fewer if there are limited cases of a passage in the corpus.

        :param expression: expression to search for
        :param no_passages: number of passages to return
        :return: List of passages as strings

        >>> from gender_analysis.corpus import Corpus
        >>> from gender_analysis.common import TEST_DATA_PATH
        >>> filepath = TEST_DATA_PATH / 'sample_novels' / 'texts'
        >>> corpus = Corpus(filepath)
        >>> results = corpus.get_sample_text_passages('he cried', 2)
        >>> 'he cried' in results[0][1]
        True
        >>> 'he cried' in results[1][1]
        True
        """
        count = 0
        output = []
        phrase = nltk_tokenize.word_tokenize(expression)
        random.seed(expression)
        random_documents = self.documents.copy()
        random.shuffle(random_documents)

        for document in random_documents:
            if count >= no_passages:
                break
            current_document = document.get_tokenized_text()
            for index in range(len(current_document)):
                if current_document[index] == phrase[0]:
                    if current_document[index:index+len(phrase)] == phrase:
                        passage = " ".join(current_document[index-20:index+len(phrase)+20])
                        output.append((document.filename, passage))
                        count += 1


        if len(output) <= no_passages:
            # fewer passages than requested: return everything found
            return output
        return output[:no_passages]
Example #22
def allowOnlyCommonWords(df, most_occur):
    #gets the formated dataframe and the array with the most common words
    dfInput = {'Tweet': [], 'Hatespeech': []}
    #Removes all words from the String that are not in the most common words
    for index, row in df.iterrows():
        tweetText = row['Tweet']
        hatespeechIndicator = row['Hatespeech']
        word_tokens = word_tokenize(tweetText)
        filtered_sentence = []
        for w in word_tokens:
            for value in most_occur:
                if value[0] == w:
                    filtered_sentence.append(w)
        dfInput["Tweet"].append(
            TreebankWordDetokenizer().detokenize(filtered_sentence))
        dfInput["Hatespeech"].append(hatespeechIndicator)

    clearedDf = pd.DataFrame(dfInput, columns=['Tweet', 'Hatespeech'])
    return clearedDf
Example #23
    def coref_true_to_file(self, data):
        # write the coref results to file
        corefCount = 0
        f = open(self.output_name + "_coref_true.tsv", "w+")
        for line in tqdm(data):
            coref_line = {"document": line.strip()}
            try:
                result = self.predictor.predict_json(coref_line)
            except KeyboardInterrupt:
                print("KeyboardInterrupt")
                break
            except Exception:
                print("problem sentence: ", line)
                continue  # skip lines the predictor cannot handle
            if len(result['clusters']) > 0:
                corefCount += 1
                f.write(TreebankWordDetokenizer().detokenize(result['document']) + "\n")
        f.close()
        print("Coref count: ", corefCount)
        print("write to file complete")
Example #24
def get_candidates(model, text, max_candidates):
    # helper function that retrieves perturbed candidates; called in get_delta_opt
    words = word_tokenize(text)
    candidates = [None] * max_candidates
    counter = 0
    for word in words:
        if wn.synsets(word) == []:
            continue
        tmp = wn.synsets(word)[0].pos()
        # if not adjective or noun, continue
        if tmp != "a" and tmp != "n":
            continue
        for a in antonyms(word):
            candidates[counter] = (TreebankWordDetokenizer().detokenize(
                [a.rstrip() if x == word else x for x in words]))
            counter += 1
            if counter >= max_candidates:
                return list(filter(None.__ne__, candidates))
    return list(filter(None.__ne__, candidates))
Example #25
def conllReader(corpus):
    '''
    Data reader for CoNLL format data
    '''
    root = "data/"
    sentences = []

    ccorpus = ConllCorpusReader(root, ".conll", ('words', 'pos', 'tree'))

    raw = ccorpus.sents(corpus)

    for sent in raw:
        sentences.append([TreebankWordDetokenizer().detokenize(sent)])

    tagged = ccorpus.tagged_sents(corpus)
    print(tagged)


    return tagged, sentences
Example #26
class word_tokenizer(tokenizer):
    def __init__(self):
        self.__pattern = r'''(?x)       
                        (?:[A-Z]\.)+      
                        | \w+(?:-\w+)*     
                        | \$?\d+(?:\.\d+)?%? 
                        | \.\.\.           
                        | [][.,;"'?():_`-]  
                        '''

        self.__tokenizer = RegexpTokenizer(self.__pattern)
        self.__detokenizer = TreebankWordDetokenizer()

    def tokenize(self, text):
        return self.__tokenizer.tokenize(text)

    def detokenize(self, iterable):
        return self.__detokenizer.detokenize(iterable)

    def encode(self, text):
        pass

    def decode(self, iterable):
        pass

    def vocab(self):
        return 0

    def __fit_file(self, file):
        self.source_file = file
        with Fast_File(file) as ff:
            self.__fit_iterable(ff)

    def __fit_iterable(self, it):
        sentencepiece.SentencePieceTrainer.Train(sentence_iterator=it,
                                                 model_writer=self.model)

    def fit(self, x):
        if isinstance(x, str):
            self.__fit_file(x)
        else:
            self.__fit_iterable(x)
Example #27
class Cleaner():
    def __init__(self):
        # nltk.download('punkt')
        self.tk = TreebankWordTokenizer()
        self.dtk = TreebankWordDetokenizer()
        self.BAD_CAT_REMOVE = re.compile('^Cat_')
        self.A_TILDE_REMOVE = re.compile('[á]')
        self.E_TILDE_REMOVE = re.compile('[é]')
        self.I_TILDE_REMOVE = re.compile('[í]')
        self.O_TILDE_REMOVE = re.compile('[ó]')
        self.U_TILDE_REMOVE = re.compile('[ú]')
        self.POINT_FOLLOWING_LETTER = re.compile(r'(?<=\S)\.(?=\w)')
        # self.BAD_SYMBOLS_REMOVE = re.compile('[^A-Za-z0-9_ áéíóú]')

    def applyRegex(self, value, regex, replacement):
        value = regex.sub(replacement, value)
        return value

    def text_cleaning(self, text):
        return pipe(
            text.lower(),
            # partial(self.BAD_SYMBOLS_REMOVE.sub,  ''),
            partial(self.A_TILDE_REMOVE.sub, 'a'),
            partial(self.E_TILDE_REMOVE.sub, 'e'),
            partial(self.I_TILDE_REMOVE.sub, 'i'),
            partial(self.O_TILDE_REMOVE.sub, 'o'),
            partial(self.U_TILDE_REMOVE.sub, 'u'),
            # partial(self.POINT_FOLLOWING_LETTER.sub('. '))
        )

    def sentence_cleaning(self, sentence, detokenize=False):
        word_tokens = pipe(sentence,
                           partial(self.POINT_FOLLOWING_LETTER.sub, '. '),
                           self.tk.tokenize)

        word_tokens = [self.text_cleaning(text) for text in word_tokens]
        # word_tokens.remove('')

        if detokenize:
            return self.dtk.detokenize(word_tokens)
        else:
            return word_tokens
Example #28
    def spell_correction(self, data_obj, stem_and_stop):
        if not stem_and_stop:
            self.tokenized_corpus = [tweet.split()
                                     for tweet in data_obj.raw_tweets]

        self.tokenized_corpus = [[
            str(TextBlob(word).correct()) for word in tweet
        ] for tweet in self.tokenized_corpus]
        self.detokenized_corpus = [
            TreebankWordDetokenizer().detokenize(tweet)
            for tweet in self.tokenized_corpus
        ]

        self.detokenized_corpus = [
            re.sub('[^A-Za-z0-9 ]', '', tweet)
            for tweet in self.detokenized_corpus
        ]
        self.tokenized_corpus = [
            regexp_tokenize(tweet, r'\S+') for tweet in self.detokenized_corpus
        ]
Example #29
def create_bert_embeddings(stories):
    sentences = []

    embedding_dimensions = 768
    single_index = SimpleNeighbors(embedding_dimensions)
    for story in stories.values():
        sentence = TreebankWordDetokenizer().detokenize(story['story'][0])
        #print(sentence)
        sentences.append(sentence)

    sbert_model = SentenceTransformer('stsb-roberta-base')
    sentence_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)

        # print("Key: ", key['movie_id'])
        # print("Embedding: ", embedding)
        # print("Vector Len: ", len(embedding))
        # input("Press any key...")
    return (single_index)
Example #30
def insert_brackets(tok_sent,
                    coref_range,
                    GENDER_PRONOUNS=[
                        'he', 'she', 'him', 'her', 'his', 'hers', 'himself',
                        'herself'
                    ]):
    start_bracket = ["<", "[", "{", "<<", "[[", "{{"]
    end_bracket = [">", "]", "}", ">>", "]]", "}}"]
    index = 0  #used to iterate through brackets array so each cluster will be within a different bracket
    for cluster in coref_range:
        if any([
            ((c[0] == c[1]) and (tok_sent[c[0]]).lower() in GENDER_PRONOUNS)
                for c in cluster
        ]):  #check if cluster contains a gender pronoun
            for (start_index, end_index) in cluster:
                tok_sent[
                    start_index] = start_bracket[index] + tok_sent[start_index]
                tok_sent[end_index] = tok_sent[end_index] + end_bracket[index]
            index += 1
    return TreebankWordDetokenizer().detokenize(tok_sent)
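A usage sketch (illustrative only), with a hand-built coreference cluster in place of real model output; each cluster is a list of (start, end) token spans:

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

tok_sent = word_tokenize("John said he would call her")
clusters = [[(0, 0), (2, 2)]]  # "John" and "he" refer to the same entity
print(insert_brackets(tok_sent, clusters))
# roughly: "<John> said <he> would call her"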
Example #31
#from six.moves import xrange
import os, sys
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer # nltk 3.3
from truecaser.Truecaser import *
import _pickle as cPickle

def convert(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    #tokens = nltk.word_tokenize(sentence)
    return getTrueCase(tokens, 'as-is', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

# load a truecase model
print("load truecaser", file=sys.stderr)
#curr_dir = os.path.dirname(__file__)
f = open("truecaser/distributions.obj", 'rb')
uniDist = cPickle.load(f)
backwardBiDist = cPickle.load(f)
forwardBiDist = cPickle.load(f)
trigramDist = cPickle.load(f)
wordCasingLookup = cPickle.load(f)
f.close()

if __name__ == "__main__":
    sent = "I do n't have cats named Tom and Jerry ."
    tokens = [x.lower() for x in sent.split()]
    print(tokens)
    truecase_tokens = convert(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)
    detokenizer = TreebankWordDetokenizer()
    sent = detokenizer.detokenize(truecase_tokens)
    print(sent)