Example #1
def preprocess_text(list_text):
    '''
    Preprocess a list of documents: lowercase, replace the symbols in
    replace_with_space with spaces, remove stopwords, lemmatize, and strip
    any remaining punctuation. Returns a 2-d list of tokens (one list per
    document).
    '''

    # variables used by the preprocessing steps below
    replace_with_space = re.compile('[/(){}\[\]\|@,;]')
    symbols_to_remove = re.compile("[^a-z _]+")
    stop_words = set(stopwords.words('english'))
    added_stopwords = ['one', 'says', 'like', 'said', 'say', 'would', 'go']
    stop_words = set(list(stop_words) + added_stopwords)

    #list where preprocessed text will be stored.
    preprocessed_text = []
    tknzr = TreebankWordTokenizer()
    lmtzr = WordNetLemmatizer()
    #stemmer = PorterStemmer()
    for sentence in list_text:
        text = sentence.lower()
        text = re.sub(replace_with_space, " ", text)
        text_tokens = tknzr.tokenize(text)
        text_tokens = [
            token for token in text_tokens if token not in stop_words
        ]
        text_tokens = [lmtzr.lemmatize(token) for token in text_tokens]
        text = nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(
            text_tokens)
        text = re.sub(symbols_to_remove, "", text)
        text_tokens = tknzr.tokenize(text)
        preprocessed_text.append(text_tokens)

    return preprocessed_text
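
A minimal usage sketch for the function above, assuming the imports and NLTK corpora it relies on are available; the sample documents and the expected output are illustrative only:

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer

# one-time setup: nltk.download('stopwords'); nltk.download('wordnet')
docs = ["Dogs are running in the park!", "She said she would go home."]
print(preprocess_text(docs))
# expected roughly: [['dog', 'running', 'park'], ['home']]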
Example #2
class Tokenizer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def tokenize(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)
        return tokens
Example #3
def tokenize(line):
    global tokenizer
    if args.skip_tokenization:
        return line
    if args.ptb:
        if tokenizer is None:
            tokenizer = TreebankWordTokenizer()
        return tokenizer.tokenize(line, convert_parentheses=True)
    return word_tokenize(line, language=args.language)
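
For reference, the convert_parentheses=True flag used above makes the Treebank tokenizer emit Penn Treebank placeholders for brackets; a quick check:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("He left (quietly).", convert_parentheses=True))
# ['He', 'left', '-LRB-', 'quietly', '-RRB-', '.']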
Example #4
  def test_word_tokenize_quotes(self):
    text = '"сл"'
    tokenizer = TreebankWordTokenizer()
    # _spans = nltk.word_tokenize(text)
    _spans = tokenizer.tokenize(text)

    spans = [s for s in _spans]
    print("".join(spans))
    for c in spans:
      print(len(c))
    self.assertEqual(3, len(spans))
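
The expected count of 3 comes from the Treebank rules that rewrite a straight opening double quote as `` and a closing one as ''; a quick check:

from nltk.tokenize import TreebankWordTokenizer

print(TreebankWordTokenizer().tokenize('"сл"'))
# ['``', 'сл', "''"]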
Example #5
class TokenizePreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, rules=True):

        self.punct = set(string.punctuation).difference(set('%='))

        self.rules = rules

        self.splitters = re.compile("[-/.,|<>]")
        self.tokenizer = TreebankWordTokenizer()

    def fit(self, X=None, y=None):
        return self

    @staticmethod
    def inverse_transform(X):
        return [", ".join(doc) for doc in X]

    def transform(self, X):
        return [self.token_representation(sentence) for sentence in X]

    def token_representation(self, sentence):
        return list(self.tokenize(sentence))

    def tokenize(self, sentence):
        """break sentence into pos-tagged tokens; normalize and split on hyphens"""

        # extremely short sentences shall be ignored by next steps
        if len(sentence) < MIN_LEN:
            yield "_empty_sentence_"
        else:
            for token in self.tokenizer.tokenize(sentence):
                # Apply preprocessing to the token
                token_nrm = self.normalize_token(token)
                subtokens = [
                    self.normalize_token(t)
                    for t in self.splitters.split(token_nrm)
                ]

                for subtoken in subtokens:
                    # skip subtokens that are empty or purely punctuation
                    if all(char in self.punct for char in subtoken):
                        continue
                    yield subtoken

    def normalize_token(self, token):
        # Apply preprocessing to the token
        token = token.lower().strip().strip('*').strip('.')

        if self.rules:
            token = map_regex_concepts(token)

        return token
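
A usage sketch for the transformer above. MIN_LEN and map_regex_concepts are defined elsewhere in the original project, so the sketch assumes a MIN_LEN value and passes rules=False to avoid the missing helper:

import re
import string
from nltk.tokenize import TreebankWordTokenizer
from sklearn.base import BaseEstimator, TransformerMixin

MIN_LEN = 5  # assumed threshold; the real constant lives elsewhere in the project

docs = ["Glucose levels rose by 5%.", "Hi"]
prep = TokenizePreprocessor(rules=False)
print(prep.fit(docs).transform(docs))
# e.g. [['glucose', 'levels', 'rose', 'by', '5', '%'], ['_empty_sentence_']]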
Example #6
class TreebankSpanTokenizer(TreebankWordTokenizer):
    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield ix, end, word_token
            ix = end

    def tokenize(self, text):
        return self._word_tokenizer.tokenize(text)
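
A quick usage sketch showing the character spans yielded by the class above:

from nltk.tokenize import TreebankWordTokenizer  # needed by the class definition

spanner = TreebankSpanTokenizer()
for start, end, token in spanner.span_tokenize("Hello, world."):
    print(start, end, token)
# 0 5 Hello
# 5 6 ,
# 7 12 world
# 12 13 .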
Example #7
def iconize_corpus(args):
    """
    This script retrives the sentences that contains
    at least one icon term
    - fdata is the current corpus
    - fembed is the icon embedding file
    """
    # load embedding terms
    embdwrds = defaultdict(str)
    embdsyns = defaultdict(str)
    with open(args.fembd, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            terms = line.split()
            term = terms[0]
            code = terms[1]
            wtype = terms[2]
            if wtype == "main":
                embdwrds[term] = code
            else:
                embdsyns[term] = code

    # filter sentences that are oov w.r.t. the embedding
    tbt = TreebankWordTokenizer()
    plist = ["..", "...", "``", "''", "."]
    with open(args.fdata, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            sen = line.lower()
            sen = ''.join(i for i in sen if ord(i) < 123)
            sen = tbt.tokenize(sen)
            sen = [x for x in sen if x not in string.punctuation]
            sen = [x for x in sen if x not in plist]
            sentence = []
            for word in sen:
                code = embdwrds[word]
                if code != "":
                    sentence.append(code)
                elif embdsyns[word] != "":
                    code = embdsyns[word]
                    sentence.append(code)
                else:  # comment for pure icon mode
                    sentence.append(word)  # pure icon mode
            sentence = str.join(" ", sentence)
            print(sentence)
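
The embedding file read above is expected to hold one entry per line in the form "term code wtype", with wtype equal to "main" for primary terms and anything else for synonyms. A minimal sketch of the lookup logic on in-memory dictionaries (the terms and codes are made up):

from collections import defaultdict

embdwrds = defaultdict(str, {"dog": "ICON_12"})    # main terms
embdsyns = defaultdict(str, {"puppy": "ICON_12"})  # synonyms

sentence = []
for word in ["the", "puppy", "barked"]:
    code = embdwrds[word] or embdsyns[word]
    sentence.append(code if code else word)
print(" ".join(sentence))  # the ICON_12 barked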
Example #8
def tokenize(s: str) -> list:
    """
    Tokenize the given text using TreebankWordTokenizer delivered along with NLTK
    :param s: text
    :return: list of tokens
    """
    from nltk import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(s)
    result = []
    for word in tokens:
        # the last "decode" function is because of Python3
        # http://stackoverflow.com/questions/2592764/what-does-a-b-prefix-before-a-python-string-mean
        w = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8').strip()
        # and add only if not empty (it happened in some data that there were empty tokens...)
        if w:
            result.append(w)

    return result
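
A usage sketch for the function above; the NFKD + ASCII round-trip strips accents (and drops non-Latin characters entirely, which is why empty tokens are filtered out):

import unicodedata  # required at module level by the function above

print(tokenize("Café-owner naïve résumé"))
# ['Cafe-owner', 'naive', 'resume']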
Example #9
class NormalizationTokenization:
    def __init__(self):
        self.letters_mappings = { u"á" : "a", 
                                  u"é" : "e", 
                                  u"í" : "i",
                                  u"ó" : "o",
                                  u"ú" : "u",
                                  u"ñ" : "n",
                                  u"ü" : "u" }
        self.tokenizer = TreebankWordTokenizer()
        
    def letter_without_accent(self, letter):
        'This method returns the version of a letter without its accent'
        if letter in self.letters_mappings:
            return self.letters_mappings[letter]
        else:
            return letter
    
    def normalize(self, text):
        '''This method returns normalized version of the text.
        It removes all disallowed characters and makes the text lower case'''
        text = text.lower()
        mapIterator = map(lambda letter: self.letter_without_accent(letter), text)
        text = "".join(mapIterator)
        
        regex = r'[^a-zA-Z0-9\s\_\-\n]'
        text = re.sub(regex, '', text)
        return text
    
    def tokenize(self, text):
        'This method returns the text divided into tokens'
        return self.tokenizer.tokenize(text)
    
    def process_text(self, text):
        '''This method is the main method of this class. 
        It processes the text and returns the result'''
        normalized_text = self.normalize(text)
        token_list = self.tokenize(normalized_text)
        return token_list
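
A usage sketch for the class above on Spanish text, assuming re and the NLTK tokenizer import are in scope:

import re
from nltk.tokenize import TreebankWordTokenizer

nt = NormalizationTokenization()
print(nt.process_text("¿Dónde está el baño?"))
# ['donde', 'esta', 'el', 'bano']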
Example #10
"""import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
print(tokenizer.tokenize(a))"""

# split into words

from nltk.tokenize import word_tokenize
print(word_tokenize('Hello World.'))

#The word_tokenize() function is a wrapper function that calls tokenize() on an
#instance of the TreebankWordTokenizer class. It's equivalent to the following code:

from nltk import TreebankWordTokenizer, regexp_tokenize

tr = TreebankWordTokenizer()
print(tr.tokenize('Hello World.'))

# PunktWordTokenizer (commented out: it was removed from NLTK and importing it now fails)
"""from nltk.tokenize import PunktWordTokenizer
tokenizer = PunktWordTokenizer()
print(tokenizer.tokenize("Can't is a contraction."))"""

# WordPunctTokenizer (separates punctuation into its own tokens)
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("Can't is a contraction."))

# RegexpTokenizer

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Can't is a contraction."))
Example #11
tokenizer = TreebankWordTokenizer()
_POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
tagger = data.load(_POS_TAGGER)

reg = "[^\w]"

adv = 0
verb = 0
adj = 0
sust = 0

total = 0
for archivo in listdir("DIANN_Corpus"):
    archivo = open(path.join("DIANN_Corpus", archivo), 'r', encoding='utf8')
    texto_archivo = archivo.read()
    tokens = tokenizer.tokenize(texto_archivo)
    filtrado = []
    for j in tokens:
        tmp = sub(reg, "", j)
        if tmp != '':
            filtrado.append(tmp)
    tags = tagger.tag(filtrado)
    total = total + len(tags)
    for j in tags:
        if j[1] == "RB" or j[1] == "RBR" or j[1] == "RBS":
            adv = adv + 1
        elif j[1] == "JJ" or j[1] == "JJR" or j[1] == "JJS":
            adj = adj + 1
        elif j[1] == "NN" or j[1] == "NNS" or j[1] == "NNP" or j[1] == "NNPS":
            sust = sust + 1
        elif j[1] == "VB" or j[1] == "VBD" or j[1] == "VBG" or j[
# output in the same dir wih name like inputFIle + proprocessed


# wordTokenizer = RegexpTokenizer("[\w']+")


finalOutputFile = open(sys.argv[1] + "_preprocessed_sentences_splitted", 'w')
reviewsJSONFile = open(sys.argv[1], "r")
linenumber = 0

word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # removes digits and punctuation, and transforms to lower case.
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower() for ch in sent)
        tokens = word_tokenizer.tokenize(sent)
        finalOutputFile.write(' '.join(token for token in tokens) + "\n")


Example #13
reviewsJSONFile = open(sys.argv[1], "r")
prefix = sys.argv[3]

linenumber = 0
dummy_name = 0

word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # removes digits and punctuation, and transforms to lower case.
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower()
                       for ch in sent)
        tokenList += word_tokenizer.tokenize(sent)

    finalOutputFile.write(
        sys.argv[2] + prefix + str(dummy_name) + " " + sys.argv[2] + " " +
        ' '.join(token for token in tokenList) + "\n")  # name label data

    dummy_name += 1
Example #14
y_test = np.array(list(map(int, read_data(test_label_path))))

y_train = np.array(list(map(lambda x: np.ceil(x/2.), y_train)))
y_test = np.array(list(map(lambda x: np.ceil(x/2.), y_test)))

all_y = np.concatenate([y_train, y_test])
all_text = train_text + test_text


for i in range(0, num_classes):
    data_list = []

    class_name = id2class[i]
    for x, y in zip(all_text, all_y):
        if y-1 == i:
            data_list.append(x)

    train_set, test_set = train_test_split(data_list, train_size=split_ratio[class_name])

    for text in train_set:
        t = tokenizer.tokenize(text, return_str=True)
        t = t.strip().replace('\t', ' ')
        fp_train.write(str(i) + '\t' + t + '\n')

    for text in test_set:
        t = tokenizer.tokenize(text, return_str=True)
        t = t.strip().replace('\t', ' ')
        fp_test.write(str(i) + '\t' + t + '\n')

fp_train.close()
fp_test.close()
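
The return_str=True flag used above makes the Treebank tokenizer return a single space-joined string instead of a token list; the flag exists in older NLTK releases but has since been deprecated. A quick check:

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("Don't panic.", return_str=True))
# prints the tokens as one string, e.g.: Do n't panic .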
Example #15
from nltk import WhitespaceTokenizer, SpaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer

if __name__ == '__main__':

    tokenizer = TreebankWordTokenizer()

    print('Using TreebankWordTokenizer:')
    sentence = "Sorry, I can't go to the workbench meeting.\n"
    tokens = tokenizer.tokenize(sentence)
    print(tokens)
    print('Number of tokens: ' + str(len(tokens)))

    print()

    tokenizer = WhitespaceTokenizer()

    print('Using WhitespaceTokenizer:')
    tokens = tokenizer.tokenize(sentence)
    print(tokens)
    print('Number of tokens: ' + str(len(tokens)))

    print()

    tokenizer = SpaceTokenizer()

    print('Using SpaceTokenizer:')
    tokens = tokenizer.tokenize(sentence)
    print(tokens)
    print('Number of tokens: ' + str(len(tokens)))

    print()
Example #16
from nltk import TreebankWordTokenizer
from collections import Counter

sentence = """The faster Harry got to the store, the faster Harry, the faster, would get home."""
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sentence.lower())
print(tokens)

bag_of_words = Counter(tokens)
print(bag_of_words)
print(bag_of_words.most_common(4))

times_harry_appears = bag_of_words['harry']
print(times_harry_appears)

nums_unique_words = len(bag_of_words)
print("nums_unique_word:{}".format(nums_unique_words))
tf = times_harry_appears / nums_unique_words
print(round(tf, 4))
Example #17
from nltk import TreebankWordTokenizer

doc_0 = "The faster Harry got to the store, the faster Harry, the faster, would get home."
doc_1 = "Harry is hairy and faster than Jill."
doc_2 = "Jill is not as hairy as Harry."

docs = []
docs.append(doc_0)
docs.append(doc_1)
docs.append(doc_2)

tokenizer = TreebankWordTokenizer()
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]

print(doc_tokens)
print("length doc 0 :{}".format(len(doc_tokens[0])))
print("length doc 0 :{}".format(len(doc_tokens[1])))
print("length doc 0 :{}".format(len(doc_tokens[2])))

all_doc_tokens = sum(doc_tokens, [])
print("all docs tokens length:{}".format(len(all_doc_tokens)))
lexicon = sorted(set(all_doc_tokens))
print("lexicon tokens length:{}".format(len(lexicon)))

from collections import OrderedDict
zero_vector = OrderedDict((token, 0) for token in lexicon)
print("zero vector:{}".format(zero_vector))

import copy
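
The snippet is cut off after import copy; a plausible continuation, assuming the intent is to fill a copy of zero_vector with normalized term counts per document (a sketch, not the original code):

from collections import Counter

doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)            # fresh all-zero vector per document
    tokens = tokenizer.tokenize(doc.lower())
    for token, count in Counter(tokens).items():
        vec[token] = count / len(lexicon)   # normalized term frequency
    doc_vectors.append(vec)
print(doc_vectors[0])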
Example #18
    loop_num = 0
    while ratio < 0.6:
        loop_num += 1
        vocabulary = vocabulary[:-200]
        num_words = len(vocabulary)
        partition1, partition2, ratio = create_data_partitions(
            data_list, vocabulary)
        print(str(loop_num) + '\t' + class_name + '\t' + str(ratio))

        if len(vocabulary) <= min_vocab:
            break

    print('Final Ratio\t' + class_name + '\t' + str(ratio))
    #
    for id_ in partition1:
        if id_ >= num_words:
            text_string = data_list[id_ - num_words]
            t = tokenizer.tokenize(text_string, return_str=True)
            t = t.strip().replace('\t', ' ')
            fp_train.write(str(i) + '\t' + t + '\n')

    for id_ in partition2:
        if id_ >= num_words:
            text_string = data_list[id_ - num_words]
            t = tokenizer.tokenize(text_string, return_str=True)
            t = t.strip().replace('\t', ' ')
            fp_test.write(str(i) + '\t' + t + '\n')

fp_train.close()
fp_test.close()
Example #19
class Tokenizer():
    def __init__(self, start_token: str, end_token: str, unk_token: str, num_words: int = None, max_seq_len: int = 100):
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
        improved_close_quote_regex = re.compile(u'([»”’])', re.U)
        improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
        self.treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
        self.treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
        self.treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

        self.word_counts = OrderedDict()
        self.word_docs = {}
        self.num_words = num_words
        self.document_count = 0

        self.START_TOKEN = start_token
        self.END_TOKEN = end_token
        self.UNK_TOKEN = unk_token
        self.MAX_SEQ_LEN = max_seq_len

    def fit_on_texts(self, texts: list):
        self.document_count = 0
        for text in texts:
            self.document_count += 1
            seq = self.treebank_word_tokenizer.tokenize(text)
            for word in seq:
                if word in self.word_counts:
                    self.word_counts[word] += 1
                else:
                    self.word_counts[word] = 1
            for word in set(seq):
                if word in self.word_docs:
                    self.word_docs[word] += 1
                else:
                    self.word_docs[word] = 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]

        # note that indices 0,1,2,3 are reserved, never assigned to an existing word
        special_token_count = 3
        self.word_index = dict(
            list(zip(sorted_voc, list(range(1 + special_token_count, len(sorted_voc) + 1 + special_token_count)))))
        self.word_index[self.START_TOKEN] = 1
        self.word_index[self.END_TOKEN] = 2
        self.word_index[self.UNK_TOKEN] = 3
        index_docs = {}
        for word, count in list(self.word_docs.items()):
            index_docs[self.word_index[word]] = count

    def texts_to_sequences(self, texts: list, search_related_word: bool = False, print_unk_warning: bool = False,
                           lower_flag: bool = False):
        res = []
        for vect in self.__texts_to_sequences_generator(texts, search_related_word, print_unk_warning, lower_flag):
            res.append(vect)
        return res

    def __texts_to_sequences_generator(self, texts: list, search_related_word: bool, print_unk_warning: bool,
                                       lower_flag: bool):
        """Transforms each text in texts in a sequence of integers.

        Only top "num_words" most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        """
        num_words = self.num_words
        for text in texts:
            seq = self.treebank_word_tokenizer.tokenize(text)
            seq = [self.START_TOKEN] + seq + [self.END_TOKEN]
            vect = []
            for w in seq:
                if lower_flag is True:
                    w = w.lower()
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        if search_related_word is True:
                            i = self.__find_idx_of_related_known_word(w)
                            if i is None:
                                if print_unk_warning is True:
                                    print("\nATTENTION:", w, "is unknown.\n")
                                vect.append(self.word_index.get(self.UNK_TOKEN))
                                continue
                            else:
                                vect.append(i)
                        else:
                            if print_unk_warning is True:
                                print("\nATTENTION:", w, "is unknown.\n")
                            vect.append(self.word_index.get(self.UNK_TOKEN))
                            continue
                            # TODO: what is with out of vocab token? (fasttext and glove)
                    else:
                        vect.append(i)
                else:
                    if search_related_word is True:
                        i = self.__find_idx_of_related_known_word(w)
                        if i is not None:
                            vect.append(i)
                            continue
                    if print_unk_warning is True:
                        print("\nATTENTION:", w, "is unknown.\n")
                    vect.append(self.word_index.get(self.UNK_TOKEN))
            yield vect
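
A usage sketch for the Tokenizer class above; the __find_idx_of_related_known_word helper is not shown in the excerpt, so the sketch keeps search_related_word at its default of False:

import re
from collections import OrderedDict
from nltk.tokenize import TreebankWordTokenizer

tok = Tokenizer(start_token='<s>', end_token='</s>', unk_token='<unk>')
tok.fit_on_texts(["the cat sat", "the dog sat"])
print(tok.texts_to_sequences(["the cat barked"]))
# e.g. [[1, 4, 6, 3, 2]]  ->  <s> the cat <unk> </s>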
Example #20
def tokenizing(words):
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(words)
    return tokens
Example #21
linenumber = 0
dummy_name = 0

word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # removes digits and punctuation, and transforms to lower case.
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower() for ch in sent)
        tokenList += word_tokenizer.tokenize(sent)

    finalOutputFile.write(sys.argv[2] +
                          prefix+str(dummy_name) + " "
                          + sys.argv[2] + " "
                          + ' '.join(token for token in tokenList)
                          + "\n")  # name label data

    dummy_name += 1

Example #22
class HindiLanguage(StopWordsFromFileMixIn):
    """Hindi language support module."""

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Hunspell instance
        '__hindi_hunspell',

        # Word tokenizer
        '__treebank_tokenizer',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir
            )

        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
            raise McLanguageException("Hunspell dictionary file does not exist at path: %s" % hunspell_dict_dir)
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
            raise McLanguageException("Hunspell affix file does not exist at path: %s" % hunspell_dict_dir)

        try:
            self.__hindi_hunspell = Hunspell(lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" % (hunspell_dict_dir, str(ex),)
            )

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g.
            you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as _:
            raise McLanguageException(hunspell_exc_message)
        else:
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)

    @staticmethod
    def language_code() -> str:
        return "hi"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले "
            "विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stem = word
            else:
                term_stems = self.__hindi_hunspell.stem(word)
                if len(term_stems) > 0:
                    stem = term_stems[0]

                    if stem is None or len(stem) == 0:
                        log.debug("Stem for word '%s' is empty or None." % word)
                        stem = word

                else:
                    log.debug("Stem for word '%s' was not found." % word)
                    stem = word

            stems.append(stem)

        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        return stems

    def split_text_to_sentences(self, text: str) -> List[str]:
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # Replace Hindi's "।" with line break to make tokenizer split on both "।" and period
        text = text.replace("।", "।\n\n")

        # No non-breaking prefixes in Hindi, so using the English file
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        # Replace Hindi's "।" with line break to make tokenizer split on both "।" and period
        sentence = sentence.replace("।", ".")

        # TweetTokenizer / sentence_splitter don't work with Hindi for whatever reason, and word_tokenize() would
        # require NLTK data to be installed which is time consuming on Travis
        tokens = self.__treebank_tokenizer.tokenize(sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # The tokenizer leaves punctuation in place, so keep only word-like tokens
        tokens = [token for token in tokens if is_word(token)]

        return tokens
Example #23
from nltk import word_tokenize, TreebankWordTokenizer

# usage: app inputFile
# output in the same dir with name like inputFile + preprocessed

# wordTokenizer = RegexpTokenizer("[\w']+")

finalOutputFile = open(sys.argv[1] + "_preprocessed_sentences_splitted", 'w')
reviewsJSONFile = open(sys.argv[1], "r")
linenumber = 0

word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # removes digits and punctuation, and transforms to lower case.
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower()
                       for ch in sent)
        tokens = word_tokenizer.tokenize(sent)
        finalOutputFile.write(' '.join(token for token in tokens) + "\n")