Example #1
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    toktok = ToktokTokenizer()

    text = ' '.join(
        [lemmatizer.lemmatize(word) for word in toktok.tokenize(text)])
    return text
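A minimal usage sketch for the snippet above, assuming the function is in scope together with the NLTK imports and corpora it relies on; note that WordNetLemmatizer defaults to the noun POS, so verb forms like "running" pass through unchanged:

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

nltk.download('wordnet')  # data needed by WordNetLemmatizer

# lemmatize_text() is the function defined in the example above
print(lemmatize_text("the cats are running"))  # -> 'the cat are running'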
Example #2
    def clean_text(self, text):
        """This function is used to clean text for sentimental analysis
        @Author: Adarsh Koppa Manjunath
        @Parameters:
            text(str): text to be cleaned
        @return
            final_output(dict): url and serach result"""
        try:

            # remove text in square brackets
            text = re.sub(r'\[[^]]*\]', '', text)
            # remove special characters (keep alphanumerics and whitespace)
            pattern = r'[^a-zA-Z0-9\s]'
            text = re.sub(pattern, '', text)
            # stem the text
            ps = nltk.porter.PorterStemmer()
            text = ' '.join([ps.stem(word) for word in text.split()])
            # tokenize and remove stop words
            tokenizer = ToktokTokenizer()
            stopword_list = set(stopwords.words('english'))
            tokens = tokenizer.tokenize(text)
            tokens = [token.strip() for token in tokens]
            filtered_tokens = [
                token for token in tokens if token.lower() not in stopword_list
            ]

            return filtered_tokens

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return "exception: failed"
Example #3
def tokenize_sentence(sentence, lang=None, punctList=None):

    if lang is None:
        lang = 'English'
    if punctList is None:
        punctList = [';', ':', ',', '.', '...', '``', "''", '¡', '!', '¿', '?']

    # accept both str and bytes input
    if isinstance(sentence, bytes):
        sentence = sentence.decode('utf-8')

    words = []
    if lang == 'Spanish':
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')
        from nltk.tokenize.toktok import ToktokTokenizer
        toktok = ToktokTokenizer()
        tokens = toktok.tokenize(sentence)
        words = [token for token in tokens if token not in punctList]
    elif lang == 'English':
        tokens = nltk.word_tokenize(sentence)
        words = [token for token in tokens if token not in punctList]
    return words
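A quick usage sketch for the default English path (hedged; it assumes the function above is in scope and that NLTK's punkt data is installed): punctuation listed in punctList is stripped from the word_tokenize output.

import nltk

nltk.download('punkt')  # required by nltk.word_tokenize

# tokenize_sentence() is the function defined in the example above
print(tokenize_sentence("Hello, world!"))  # -> ['Hello', 'world']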
Example #4
    def prepareToClf(self, text):
        txt = str(text)
        # Tokenize tweets. Word splitting.
        exclusionList = [
            r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '-&gt'
        ]
        exclusions = '|'.join(exclusionList)
        txt = re.sub(exclusions, '', ''.join(txt).rstrip(), flags=re.MULTILINE)
        toktok = ToktokTokenizer()
        tokens = toktok.tokenize(txt)
        words = [token.lower() for token in tokens]
        from stopwords_ca import get_stopwords
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        # stop words taken from http://latel.upf.edu/morgana/altres/pub/ca_stop.htm (we wrote our own function)
        stop_words = get_stopwords()
        words = [
            emoji_pattern.sub(r'', w) for w in words if w not in stop_words
        ]  # drop stop words and strip emoji
        table = str.maketrans('', '', ''.join([string.punctuation, "’"]))
        words = [w.translate(table) for w in words]

        import unidecode
        unaccented_string = unidecode.unidecode(','.join(words))
        return self.tf_vectorizer.transform([unaccented_string]).toarray()
Example #5
 def _tokenizer(self, x, quit_commons=True):
     """
     Tokenize a text string: lowercase it and remove special characters,
     stopwords, numbers, and proper names.

     Args:
         x (str): String to tokenize.
         quit_commons (bool): Whether to also remove a list of common words. Default: True.

     Returns:
         list: List of tokens.
     """
     toktok = ToktokTokenizer()
     common_words = []
     if quit_commons:
         common_words = commons
     # normalize: lowercase and map "o dos" to "o2"
     x_lower = x.lower().replace("o dos", "o2")
     spanish_stopwords = set(stopwords.words('spanish'))
     tokens_not_filter = [
         unidecode(item) for item in toktok.tokenize(x_lower)
     ]
     tokens = [
         item for item in tokens_not_filter
         if item not in spanish_stopwords and item not in numwords
         and item not in common_words and item not in names
         and len(item) > 2
     ]
     return tokens
Example #6
    def create(cls,
               df_path,
               file_names=None,
               min_char_len=1,
               model_name="bert-base-multilingual-cased",
               max_sequence_length=424,
               pad_idx=0,
               clear_cache=False,
               df=None):
        tokenizer = BertTokenizer.from_pretrained(model_name)
        config = {
            "file_names": file_names,
            "min_char_len": min_char_len,
            "model_name": model_name,
            "max_sequence_length": max_sequence_length,
            "clear_cache": clear_cache,
            "df_path": df_path,
            "pad_idx": pad_idx
        }
        if clear_cache:
            df = cls.files2sentences_df(file_names, min_char_len)
        elif df is None:
            df = pd.read_csv(df_path, sep='\t')
        elif isinstance(df, list):
            df = pd.DataFrame({"text": df})

        self = cls(tokenizer,
                   word_tokenizer=ToktokTokenizer(),
                   df=df,
                   config=config)
        if clear_cache:
            self.save()
        return self
Example #7
def parse_data(data):
    """
    Parse all unique sentences in data.

    :param data: pandas.DataFrame with text data
    :returns parsed_data: pandas.DataFrame with one parse tree per sentence
    """
    parser_en = spacy.load('en_core_web_md', disable=['ner', 'textcat'])
    parser_es = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])
    # wrap NLTK tokenizers so spaCy uses them instead of its defaults
    parser_en.tokenizer = NLTKTokenizerSpacy(parser_en.vocab, TweetTokenizer())
    parser_es.tokenizer = NLTKTokenizerSpacy(parser_es.vocab, ToktokTokenizer())
    data.loc[:, 'lang'] = data.loc[:, 'txt'].apply(lambda x: langid.classify(x)[0])
    parsed_data = []
    for i, data_i in data.iterrows():
        txt = data_i.loc['txt']
        txt = clean_data_for_spacy(txt)
        sents = sent_tokenize(txt)
        parsed_data_i = []
        for sent in sents:
            if(data_i.loc['lang'] == 'es'):
                parse_i = parser_es(sent)
            else:
                parse_i = parser_en(sent)
            # extract tree
            tree_i = build_parse(parse_i, parse_type='spacy')
            parsed_data_i.append(tree_i)
        parsed_data_i = pd.DataFrame(pd.Series(parsed_data_i), columns=['parse'])
#         logging.debug('processing id %s/%s'%(data_i.loc['id'], int(data_i.loc['id'])))
        parsed_data_i = parsed_data_i.assign(**{'id' : int(data_i.loc['id'])})
        parsed_data.append(parsed_data_i)
    parsed_data = pd.concat(parsed_data, axis=0)
#     parsed_data.loc[:, 'id'] = parsed_data.loc[:, 'id'].astype(np.int64)
    return parsed_data
Example #8
def process_imdb_data():
    # Tokenization of text
    tokenizer = ToktokTokenizer()
    # Setting English stopwords
    stopword_list = nltk.corpus.stopwords.words('english')

    imdb_data = pd.read_csv('./../Datasets/IMDB/IMDB_Dataset.csv')

    # Strip HTML markup and other noise from each review
    imdb_data['review'] = imdb_data['review'].apply(denoise_text)

    # Remove special characters
    imdb_data['review'] = imdb_data['review'].apply(remove_special_characters)

    # Apply simple stemming
    imdb_data['review'] = imdb_data['review'].apply(simple_stemmer)

    # Remove stop words
    imdb_data['review'] = imdb_data['review'].apply(remove_stopwords,
                                                    args=(tokenizer,
                                                          stopword_list))

    imdb_data.to_csv('./../Datasets/IMDB/Processed_IMDB_Dataset.csv')

    # TF-IDF vectorizer over unigrams to trigrams
    # (note: with an integer max_df=1, only terms appearing in a single document are kept)
    tv = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1, 3))
    # transform the processed reviews
    tv_reviews = tv.fit_transform(imdb_data.review)
Example #9
 def __init__(self,
              stopwords: list = None,
              ngram_range: List[int] = None,
              lemmas=False,
              lowercase: bool = None,
              alphas_only: bool = None,
              **kwargs):
     """
     :param stopwords: a set of words to skip
     :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to
     [1, 2], for bigrams only should be set to [2, 2]
     :param lemmas: whether to perform lemmatizing or not while tokenizing, currently works only
     for the English language
     :param lowercase: perform lowercasing or not
     :param alphas_only: should filter numeric and alpha-numeric types or not
     """
     if ngram_range is None:
         ngram_range = [1, 1]
     self._stopwords = stopwords or []
     self.tokenizer = ToktokTokenizer()
     self.lemmatizer = pymorphy2.MorphAnalyzer()
     self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
     self.lemmas = lemmas
     self.lowercase = lowercase
     self.alphas_only = alphas_only
     self.tok2morph = {}
Example #10
def vectorizerV2(raw_text, vectorWords):
    toktok = ToktokTokenizer()
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentences = tokenizer.tokenize(raw_text)

    vector = []

    counterCommas = 0
    counterPoints = raw_text.count(".")
    countersWordsInSentence = []
    for sentence in sentences:
        counterCommas += sentence.count(",")
        countersWordsInSentence.append(len(toktok.tokenize(sentence)))
        for token in toktok.tokenize(sentence):
            vectorWords[token] += 1

    vector.append(counterCommas)
    vector.append(counterPoints)

    sumatory = 0
    for counter in countersWordsInSentence:
        sumatory += counter

    averageWordsInSentence = sumatory / len(countersWordsInSentence)

    vector.append(averageWordsInSentence)

    vector.append(len(sentences))

    for word, count in vectorWords.items():
        vector.append(count)

    # number of commas | number of periods | average words per sentence | number of sentences | count of each word from the full word set in the text ...
    return np.array(vector)
Example #11
    def __init__(self,
                 stopwords: Optional[List[str]] = None,
                 ngram_range: List[int] = None,
                 lemmas: bool = False,
                 lowercase: Optional[bool] = None,
                 alphas_only: Optional[bool] = None,
                 save_path: Optional[str] = None,
                 load_path: Optional[str] = None,
                 **kwargs):

        if ngram_range is None:
            ngram_range = [1, 1]
        self.stopwords = stopwords or []
        self.tokenizer = ToktokTokenizer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()
        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only
        self.manager = mp.Manager()
        self.tok2morph = self.manager.dict()
        self.load_path = load_path
        self.save_path = save_path
        if load_path:
            load(load_path)
        self.nglist = self.manager.dict()
Example #12
def vectorizer(raw_text):
    toktok = ToktokTokenizer()
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentences = tokenizer.tokenize(raw_text)

    vector = []

    counterCommas = 0
    countersWordsInSentence = []
    for sentence in sentences:
        counterCommas += sentence.count(",")
        countersWordsInSentence.append(len(toktok.tokenize(sentence)))

    vector.append(counterCommas)

    sumatory = 0
    for counter in countersWordsInSentence:
        sumatory += counter

    averageWordsInSentence = sumatory / len(countersWordsInSentence)

    vector.append(averageWordsInSentence)

    vector.append(len(sentences))

    return np.array(vector)
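A rough usage sketch (assumptions: the function above is in scope, `import nltk` and `import numpy as np` are available, and NLTK's Spanish punkt model is installed); the returned vector holds the comma count, the average Toktok tokens per sentence, and the sentence count, in that order:

import nltk

nltk.download('punkt')  # provides tokenizers/punkt/spanish.pickle

# vectorizer() is the function defined in the example above
features = vectorizer("Hola, ¿cómo estás? Muy bien, gracias.")
print(features)  # [number of commas, average words per sentence, number of sentences]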
Example #13
def main():
    tokenizer = ToktokTokenizer()
    stopword_list = nltk.corpus.stopwords.words("english")
    stopword_list.remove("no")
    stopword_list.remove("not")
    stopword_set = set(stopword_list)
    df = pd.read_csv("lyrics_urls.csv", delimiter="\t")
    updated_lyrics = {'artist_name': [], 'song_title': [], 'lyrics': []}
    for index, lyric_url in enumerate(df["lyric_url"]):
        print(lyric_url)
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
    AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
        }
        genius_lyric_page = requests.get(lyric_url, headers=headers)
        soup = BeautifulSoup(genius_lyric_page.text, "html.parser")
        genius_lyrics_content = soup.select("p")[0]
        verses = remove_song_sections_tags(
            genius_lyrics_content.text).split("\n")
        lyrics = [
            text_processing(verse, tokenizer, stopword_set) for verse in verses
            if verse != ""
        ]
        updated_lyrics['artist_name'].append(df.loc[index]['artist'])
        updated_lyrics['song_title'].append(df.loc[index]['title'])
        updated_lyrics['lyrics'].append(lyrics)

    pd.DataFrame(updated_lyrics).to_csv('lyric_data.csv', sep="\t")
Example #14
def remove_stopwords(text):
    '''
    text should be in lower case
    Input: "The, and, if are stopwords, computer is not"
    Output: ", , stopwords , computer not"
    '''
    # sklearn's built-in English stop words
    # (ENGLISH_STOP_WORDS from sklearn.feature_extraction.text)
    stopwords_sklrn = frozenset(ENGLISH_STOP_WORDS)
    stopwords_nltk = frozenset(stopwords.words('english'))
    stopwords_wrdcld = frozenset(STOPWORDS)
    all_stopwords = frozenset(
        pd.Series(
            list(stopwords_sklrn) + list(stopwords_nltk) +
            list(stopwords_wrdcld)).unique())
    # print('# of stopwords in each lib: ',len(stopwords_sklrn), len(stopwords_nltk), len(stopwords_wrdcld))
    # print('# of stopwords when aggregated:', len(all_stopwords))

    ## Removing some words from stopwords
    stopword_list = list(all_stopwords)
    excpt_stopword = ['no', 'not']
    for ele in excpt_stopword:
        stopword_list.remove(ele)

    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [
        token for token in tokens if token.lower() not in stopword_list
    ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
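A usage sketch reproducing the docstring's example, assuming the function above is in scope along with the stop-word sources it references (sklearn's ENGLISH_STOP_WORDS, nltk.corpus.stopwords after nltk.download('stopwords'), and wordcloud's STOPWORDS):

# remove_stopwords() is the function defined in the example above
print(remove_stopwords("The, and, if are stopwords, computer is not"))
# -> ', , stopwords , computer not'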
Example #15
 def __init__(self, seed=42, ngram_range=(1, 3)):
     self.seed = seed
     self.init_seed()
     self.ngram_range = ngram_range
     self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
     self.clf = LinearSVC(multi_class="ovr")
     self.word_tokenizer = ToktokTokenizer()
Example #16
def create_spanish_english_alignments(spa_file, eng_file, spa_trans_file):

    toktok = ToktokTokenizer()
    
    massalign_sentence_pairs = get_massalign_sentence_pairs(spa_trans_file, eng_file)
    
    # To map back to the original Spanish segment, you can either store the translation
    # at sentence level or use Gale-Church to get sentence alignments from the documents.
    translation_sentence_pairs = sentence_align(spa_file, spa_trans_file, 0.97, 1.8)
    
    pairs = []
    for eng_trans, eng_org in massalign_sentence_pairs:
        eng_simple_tok_1 = toktok.tokenize(eng_trans)
        
        spanish = ''
        prev_spa = ''
        for spa, eng in translation_sentence_pairs:
            eng_simple_tok_2 = toktok.tokenize(eng)
        
            I = len(set(eng_simple_tok_2).intersection(set(eng_simple_tok_1)))
            U = len(set(eng_simple_tok_2))
            try:
                percent_overlap = float(I) / U
                if percent_overlap > 0.5 and spa != prev_spa:
                    spanish += spa
                    prev_spa = spa
                    break
            except ZeroDivisionError:
                continue
        if spanish != '':
            pairs.append([spanish, eng_org])
    return pairs
Example #17
    def tokenize(self,
                 text,
                 a_preserve_case=True,
                 a_reduce_len=False,
                 a_strip_handles=False):

        own_tokenizer = None
        tokens = []
        own_extend = tokens.extend

        if self.__token_whitespace:
            tokens = text.split(" ")

        elif self.__language == "persian":
            own_tokenizer = ToktokTokenizer()
            for t in text:
                own_extend(own_tokenizer.tokenize(t))
        else:
            own_tokenizer = nltk_data.load("tokenizers/punkt/" +
                                           self.__language + ".pickle")
            sents = own_tokenizer.tokenize(text)
            for sent in sents:
                own_extend(word_tokenize(sent, language=self.__language))

        return tokens
Example #18
    def __init__(self, stopwords: str):
        self.rgc = re.compile('[^a-zа-яё0-9-_]')
        self.tokenizer = ToktokTokenizer()
        self.stemmer = PorterStemmer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()

        with open(stopwords, 'r') as f:
            self.stopwords = set(f.read().split('\n'))
Example #19
def get_keywords(text):
    text_without_punct = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    toktok = ToktokTokenizer()
    texto_tokenized = toktok.tokenize(text_without_punct.lower())

    spanish_stopwords = set(stopwords.words('spanish'))
    keywords = [word for word in texto_tokenized if word not in spanish_stopwords]

    return keywords
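A rough usage sketch (assuming the function above is in scope, plus `import re`, `import string`, `from nltk.corpus import stopwords`, and `nltk.download('stopwords')`); Spanish articles and prepositions are filtered out as stop words:

# get_keywords() is the function defined in the example above
print(get_keywords("El gato duerme en la casa."))
# e.g. ['gato', 'duerme', 'casa']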
Example #20
def build_word_frequency(filepath, language, output_path):
    """ Parse the passed in text file (likely from Open Subtitles) into
        a word frequency list and write it out to disk

        Args:
            filepath (str): path of the text file to parse
            language (str): language code; "es" uses the ToktokTokenizer, otherwise whitespace tokenization
            output_path (str): path to write the word frequency list to
        Returns:
            Counter: The word frequency as parsed from the file
        Note:
            This only removes words that are proper nouns (attempts to...) and
            anything that starts or stops with something that is not in the alphabet.
    """
    # NLTK is only needed in this portion of the project
    try:
        from nltk.tag import pos_tag
        from nltk.tokenize import WhitespaceTokenizer
        from nltk.tokenize.toktok import ToktokTokenizer
    except ImportError as ex:
        raise ImportError(
            "To build a dictioary from scratch, NLTK is required!\n{}".format(
                ex.message))

    word_frequency = Counter()
    if language == "es":
        tok = ToktokTokenizer()
    else:
        tok = WhitespaceTokenizer()

    idx = 0
    with load_file(filepath, 'utf-8') as fobj:
        for line in fobj:
            # tokenize into parts
            parts = tok.tokenize(line)

            # Attempt to remove proper nouns
            # Remove things that have leading or trailing non-alphabetic characters.
            tagged_sent = pos_tag(parts)
            words = [
                word[0].lower() for word in tagged_sent
                if word[0] and not word[1] == "NNP" and word[0][0].isalpha()
                and word[0][-1].isalpha()
            ]

            # print(words)
            if words:
                word_frequency.update(words)

            idx += 1

            if idx % 100000 == 0:
                print("completed: {} rows".format(idx))
        # end file loop
    print("completed: {} rows".format(idx))
    export_word_frequency(output_path, word_frequency)

    return word_frequency
Example #21
 def __init__(self, seed=42):
     self.seed = seed
     self.init_seed()
     self.is_loaded = False
     self.tokenizer = ToktokTokenizer()
     self.morph = morph
     self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                             tokenizer=str.split)
     self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
Example #22
def main():
    tokenizer = ToktokTokenizer()
    stopword_list = nltk.corpus.stopwords.words('english')

    feature_fns = [token_features, token_pair_features, lexicon_features]
    # Read data from Excel
    docs, labels = get_labeled_data('train')
    # Evaluate accuracy of many combinations
    # of tokenization/featurization.
    results = eval_all_combinations(docs, labels, [True, False], feature_fns,
                                    [2, 5, 10])
    # Print information about these results.
    best_result = results[0]
    worst_result = results[-1]
    print('best cross-validation result:\n%s' % str(best_result))
    print('worst cross-validation result:\n%s' % str(worst_result))

    clf, vocab = fit_best_classifier(docs, labels, results[0])

    # Print top coefficients per class.
    print('\nTOP COEFFICIENTS PER CLASS:')
    print('negative words:')
    print('\n'.join(
        ['%s: %.5f' % (t, v) for t, v in top_coefs(clf, 0, 5, vocab)]))
    print('\npositive words:')
    print('\n'.join(
        ['%s: %.5f' % (t, v) for t, v in top_coefs(clf, 1, 5, vocab)]))

    # Parse test data
    test_docs, test_labels, X_test = parse_test_data(best_result, vocab)

    # Evaluate on test set.
    predictions = clf.predict(X_test)
    accuracy_test = accuracy_score(test_labels, predictions)
    print('testing accuracy=%f' % accuracy_test + "\n")

    # Dumping the accuracy to a pickle file
    pickle.dump(accuracy_test, open('./test_accuracy.pkl', 'wb'))

    ### 'misclassified' for misclassified documents
    ### 'actual' for positive and negative documents
    print('\nTOP MISCLASSIFIED TEST DOCUMENTS:')
    misclassified_list = print_top_predicted(test_docs, test_labels, X_test,
                                             clf, 5, 'misclassified')
    pickle.dump(misclassified_list, open('./misclassified_list.pkl', 'wb'))

    print('\nTOP NEGATIVE TEST DOCUMENTS:')
    negative_predicted_list = print_top_predicted(test_docs, test_labels,
                                                  X_test, clf, 5, 'actual', 0)
    pickle.dump(negative_predicted_list,
                open('./negative_predicted_list.pkl', 'wb'))

    print('\nTOP POSITIVE TEST DOCUMENTS:')
    positive_predicted_list = print_top_predicted(test_docs, test_labels,
                                                  X_test, clf, 5, 'actual', 1)
    pickle.dump(positive_predicted_list,
                open('./positive_predicted_list.pkl', 'wb'))
Example #23
 def __init__(self):
     # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
     self._tok = ToktokTokenizer()
     # self._tok = MosesTokenizer(lang='en')
     self._stemmer = SnowballStemmer('english')
     self._lemmatizer = TreeTagger(language='english')
     self._stopwords = set(open(STOPWORDS).read().splitlines())
     # stopwords.words('french')
     self._porter_stemmer = nltk.stem.porter.PorterStemmer()
Example #24
 def __init__(self, seed=42):
     self.seed = seed
     self.init_seed()
     self.tokenizer = ToktokTokenizer()
     self.morph = pymorphy2.MorphAnalyzer()
     self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                             tokenizer=str.split)
     self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
     super().__init__()
Example #25
def buscar_palabras(dataset, palabras_dataset):
    tokenizador = ToktokTokenizer()
    palabras = tokenizador.tokenize(dataset)
    datos = {}

    for p in palabras_dataset:
        datos[p] = (p in palabras)

    return datos
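Usage sketch (assuming the function above is in scope and ToktokTokenizer is imported): the result maps each word of interest to whether it appears among the tokens of the dataset string.

# buscar_palabras() is the function defined in the example above
print(buscar_palabras("el perro ladra en el parque", ["perro", "gato"]))
# -> {'perro': True, 'gato': False}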
Example #26
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except ImportError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from sacremoses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install SacreMoses. "
                  "See the docs at https://github.com/alvations/sacremoses "
                  "for more information.")
            raise
    elif tokenizer == "toktok":
        try:
            from nltk.tokenize.toktok import ToktokTokenizer
            toktok = ToktokTokenizer()
            return toktok.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at https://nltk.org  for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
Example #27
def process_stopwords():
    # read one stop word per line from the "stopwords" file
    with open("stopwords") as f:
        wordList = f.readlines()

    stopwords = [stopword.strip('\n') for stopword in wordList]

    tokenizer = ToktokTokenizer()
    return tokenizer, stopwords
Example #28
def my_tokenizer(iterator):
    global max_len
    tknzr = ToktokTokenizer()
    for value in iterator:
        value = value.replace('-', " - ")
        value = value.replace('/', " / ")
        value = value.lower()
        value = tknzr.tokenize(value)
        max_len = max(max_len, len(value))
        yield value
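Usage sketch (assuming the generator above is in scope with its module-level `max_len` counter and the ToktokTokenizer import): hyphens and slashes are padded so they become separate tokens, everything is lower-cased, and `max_len` tracks the longest token list seen.

# my_tokenizer() is the generator defined in the example above
max_len = 0  # module-level counter updated by my_tokenizer
for tokens in my_tokenizer(["State-of-the-art NLP", "plain text"]):
    print(tokens)
# -> ['state', '-', 'of', '-', 'the', '-', 'art', 'nlp']
# -> ['plain', 'text']
print(max_len)  # -> 8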
Example #29
 def __init__(self):
     self.tokenizer = ToktokTokenizer()
     self.word_vectorizer = TfidfVectorizer(
         tokenizer=self.tokenizer.tokenize, lowercase=True, analyzer="word")
     self.char_vectorizer = TfidfVectorizer(
         tokenizer=self.tokenizer.tokenize,
         lowercase=True,
         analyzer="char",
         ngram_range=(1, 3),
     )
Example #30
def remove_stopwords(text, is_lower_case=False):
    stopword_list = nltk.corpus.stopwords.words('english')
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
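Usage sketch (assuming the function above is in scope, `import nltk`, the ToktokTokenizer import, and `nltk.download('stopwords')`); unlike Example #13, this variant does not whitelist "no"/"not", so negations are dropped along with the other stop words:

# remove_stopwords() is the function defined in the example above
print(remove_stopwords("This movie is not good"))
# -> 'movie good'  ("not" is in NLTK's English stop word list)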