def stanford_tokenize(
        texts: typing.List[str]) -> typing.List[typing.List[str]]:
    """
    Takes a list of strings and tokenizes every string in the list.
    """
    tokenizer = StanfordTokenizer()
    return tokenizer.tokenize_sents(texts)
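# Usage sketch (not part of the original example): assumes the Stanford
# tokenizer/POS-tagger jar is visible to NLTK, e.g. via the CLASSPATH
# environment variable; the sentences below are illustrative only.
# sents = ["Good muffins cost $3.88 in New York.", "Please buy me two of them."]
# for tokens in stanford_tokenize(sents):
#     print(tokens)  # one token list per input string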
Example #2
def tokenize_stopwords_stemmer(texts):
    # texts: a list holding strings
    # use this block when tokenizing with the Stanford tokenizer; skip it for an ordinary tokenizer
    #tokenize
    Str_texts = texts[0]
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    texts_filtered0 = [
        document for document in texts_tokenized
        if not document in pa1.findall(document)
    ]

    p2 = r'.+[-_\./].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)

    texts_filtered = [
        document for document in texts_filtered
        if document != '' and document != "''" and document != "``"
    ]

    #stopwords
    english_stopwords = stopwords.words('english')  # get the stop word list
    texts_filtered_stopwords = [
        document for document in texts_filtered
        if not document in english_stopwords
    ]  #

    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list

    texts_filtered = [
        document for document in texts_filtered_stopwords
        if not document in english_punctuations
    ]  #

    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # a list

    return texts_Stemmered  # returns a list
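# Illustrative call (an assumption, not from the original source): the function
# only tokenizes texts[0], so the argument is a list whose first element is the
# raw string, and stanford-parser.jar must sit in the working directory.
# tokens = tokenize_stopwords_stemmer(["The web-pages were parsed and use_cases extracted."])
# print(tokens)  # hyphen/underscore tokens split, stopwords and punctuation removed, stems returned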
    """
Example #3
def segment(texts):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        words = tk.tokenize(text)
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
Example #4
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(tqdm(qa)):
        row['question_toked'] = MyTokenizer.tokenize(row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa, open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
Example #5
def simp_syn_sent(sent):
    strs = ""
    # the original tokens in the sent


    #import pdb; pdb.set_trace()
    #print "syn sent: ", sent
    #import pdb; pdb.set_trace()
    tokens = StanfordTokenizer().tokenize(sent)
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']

    #w = result.tree()
    #print "parse_tree:", w

    #TODO: use the tree structure, check again
    node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))
        #node_list[base.get_triples[0]] = base.get_triples(node)


    #import pdb; pdb.set_trace()
    if len(sent) > 0:
        # try each simplification in turn and return the first non-empty result
        for simp in (simp_coordi_sent, simp_subordi_sent, simp_advcl_sent,
                     simp_parti_sent, simp_adjec_sent, simp_appos_sent,
                     simp_passive_sent):
            strs = simp(tokens, node_list)
            if len(strs) > 0:
                return strs

    return strs
Example #6
def segment_en(texts, flag_keep_number=False):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        if flag_keep_number:
            words = tk.tokenize(text)
        else:
            words = map(replace_number, tk.tokenize(text))
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
Example #7
def Tokenize_stopwords_stemmer(texts):
    #print time()
    # use this block when tokenizing with the Stanford tokenizer; skip it for an ordinary tokenizer
    #tokenize
    Str_texts = texts[0]
    print(os.getcwd())
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string
    #print time()
    p2 = r'.+[-_\./"].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_tokenized:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    #print time()
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    p3 = r'.+">'
    pa3 = re.compile(p3)
    english_stopwords = stopwords.words('english')  # get the stop word list
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list
    texts_filtered0 = []
    for document in texts_filtered:
        if document in pa1.findall(document) or document in pa3.findall(
                document
        ) or document == '' or document == "''" or document == "``" or document in english_stopwords or document in english_punctuations:
            pass
        else:
            texts_filtered0.append(document)
    #print time()

    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered0]  # a list
    #print time()

    return texts_Stemmered  # returns a list
Example #8
def readwordarr(isTokenize=True):
    posWords = []
    negWords = []
    stopwords = getstopword()
    if isTokenize:
        tokenizer = StanfordTokenizer()
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    else:
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    return posWords, negWords
Example #10
class WordSegment(object):
    def __init__(self, user_dict=None):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"]
        )
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )

        # TODO:
        # add a user-defined dictionary here

    def get_tokens(self, text):
        tokens = self.enTokenizer.tokenize(text)

        return self.en_tagger.tag(tokens)

    def get_new_words(self, text):
        pass
Example #11
    def dtm_builder(self, runType):
        '''
        desc: coordinates all of the data processing and tokenizes every
              element of the sequence
        returns: a one-hot encoded 3-dimensional matrix of training data and
                 testing data, which represents the next element in the
                 sequence
        '''
        if (runType=='training'):
            dataFiles = ['{}/{}'.format(DATA_DIR, file) for file in os.listdir(DATA_DIR) if file.endswith('.txt')]
            allTxt = '<eos>'.join([self._readFile(file) for file in dataFiles])
        elif (runType=='testing'):
            inputString = input('Enter test string: ')
            allTxt = inputString
            assert(type(allTxt)==str), 'input must be a string'

        allTxtTok = StanfordTokenizer().tokenize(allTxt)
        allTxt_allSeq = '||*||'.join(allTxtTok).split('<eos>')
        allTxt_bySeq = [seq.split('||*||') for seq in allTxt_allSeq]
        allTxt_bySeq = [list(filter(None, seq)) for seq in allTxt_bySeq]
        for seq in allTxt_bySeq: seq.append('<eos>')
        txtDocTokBySeqPad = self._padSeq(allTxt_bySeq)
        unqVoc_LookUp = self._buildVocLookUp(txtDocTokBySeqPad, runType)
        if(runType == 'training'):
            oheTrainData, oheTrainLabel = self._oneHotEncode(txtDocTokBySeqPad, unqVoc_LookUp, runType)
            return [oheTrainData, oheTrainLabel]
        else:
            oheTrainData = self._oneHotEncode(txtDocTokBySeqPad, unqVoc_LookUp, runType)
            return [oheTrainData, unqVoc_LookUp, inputString]
Example #12
def simp_syn_sent_(sent):
    strs = ""
    # the original tokens in the sent
    """
    lst1 = "Peter, who liked fruits, ate an apple.".split()
    _lst = sent.split()

    #import pdb; pdb.set_trace()
    if lst1 == _lst:
        return "Peter liked fruits. Peter ate an apple."
    """
    #import pdb; pdb.set_trace()
    #print(sent)
    #import pdb; pdb.set_trace()
    tokens = StanfordTokenizer().tokenize(str(sent))
    #tokens = wordpunct_tokenize(str(sent))
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']


    #import pdb; pdb.set_trace()
    #w = result.tree()
    #print "parse_tree:", w
    #for row in result.triples():
    #    print(row)


    #import pdb; pdb.set_trace()
    #TODO: use the tree structure, check again
    node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))
        #node_list[base.get_triples[0]] = base.get_triples(node)


    #import pdb; pdb.set_trace()
    #strs = simp_coordi_sent(tokens, node_list)
    #strs = simp_subordi_sent(tokens, node_list)
    #strs = simp_advcl_sent(tokens, node_list)
    #strs = simp_parti_sent(tokens, node_list)
    strs = simp_adjec_sent(tokens, node_list)
    #strs = simp_appos_sent(tokens, node_list)
    #strs = simp_passive_sent(tokens, node_list)

    return strs
Example #13
def data():
    with open("wonderland.txt", "r", encoding="utf-8-sig") as file:
        words = StanfordTokenizer(r"C:\stanford-postagger-2016-10-31\stanford-postagger.jar") \
            .tokenize(file.read().lower())
    voc_list = sorted(set(words))
    vocabulary = dict(zip(voc_list, itertools.count()))
    words_idx = [vocabulary[word] for word in words]
    return voc_list, vocabulary, words_idx
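# Hedged usage sketch: assumes wonderland.txt and the Stanford POS-tagger jar
# exist at the paths hard-coded in data(); shows how the returned structures
# relate (vocabulary maps a token to its index in voc_list, and words_idx
# encodes the whole text as those indices).
# voc_list, vocabulary, words_idx = data()
# print(len(voc_list), "distinct tokens")
# print(words_idx[:10])                         # first ten tokens as ids
# print([voc_list[i] for i in words_idx[:10]])  # the same ten tokens as words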
Example #14
class POSTagger:
    """POSTagger creates a POS tagger for german language. Different tagger are available to use."""
    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
Example #15
def instance_tokenizer(language, stanfordpath=None):
    # you can add more params or kinds of tokenizer from here:
    # http://www.nltk.org/api/nltk.tokenize.html
    if stanfordpath:
        tok = StanfordTokenizer(path_to_jar=stanfordpath)
    else:
        tok = WordPunctTokenizer()

    return tok
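# Usage sketch (paths and sentence are illustrative assumptions): without a
# stanfordpath the helper falls back to WordPunctTokenizer, which needs no jar.
# tok = instance_tokenizer("en")
# print(tok.tokenize("Good muffins cost $3.88 in New York."))
# tok = instance_tokenizer("en", stanfordpath="stanford-postagger.jar")  # Stanford-backed variant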
def tokenize_stopwords_stemmer(texts):
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(
        path_to_jar=r"C:\Users\zw\Desktop\stanford-parser.jar")  # path_to_jar: 用来定位jar包,r是防止字符转义的,如果路径中出现'\t'的话 不加r的话\t就会被转义 而加了'r'之后'\t'就能保留原有的样子
    java_path = 'E:soft/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string; tokenize it
    # print(texts_tokenized)

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)  # re.compile() compiles the regex string into a Pattern object, which is then used to match the text and produce Match objects
    texts_filtered0 = [document for document in texts_tokenized if not document in pa1.findall(document)]

    p2 = r'.+[-_\/].+'  # changed from r'.+[-_\./].+' so that periods between digits are kept, e.g. version numbers like 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:  # split(): cut the string on the given separator and return the resulting list
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)

    texts_filtered = [document for document in texts_filtered if
                      document != '' and document != "''" and document != "``"]  # filter out empty strings, quotes and backticks

    # # stopwords
    # english_stopwords = stopwords.words('english')  # get the stop word list
    # texts_filtered_stopwords = [document for document in texts_filtered if not document in english_stopwords]  # filter out stop words

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\n', '||',
                            '<', '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '0', '$', '^', '/*', '*/', '/**', '**/',
                            '**', '-', '_', '__', '|', '+', '=', r'-?-', r'@?']  # punctuation list

    texts_filtered = [document for document in texts_filtered if
                      not document in english_punctuations]  # filter out punctuation
    return texts_filtered
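# Illustrative effect of the filtering above (an assumption about typical output,
# not from the original source): a token such as 'foo_bar' or 'foo/bar' is split
# into 'foo' and 'bar', while a version string like '3.1.2' no longer matches p2
# and is kept intact.
# tokenize_stopwords_stemmer(["Use foo_bar or foo/bar since release 3.1.2 ."])
# -> ['Use', 'foo', 'bar', 'or', 'foo', 'bar', 'since', 'release', '3.1.2']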
    def __init__(self, task_queue, result_queue):

        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

        self.tokenizer = StanfordTokenizer(options={"ptb3Escaping": True})
        print('%s: Loading pickles...' % self.name)
        self.map_word_index = map_word_index_model
        print('%s: Done.' % self.name)
Example #18
    def __init__(self, user_dict=None):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"]
        )
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
Example #19
def data():
    with open("wonderland.txt", "r", encoding="utf-8-sig") as file:
        return [
            word.lower() for word in StanfordTokenizer(
                path_to_jar=
                r"C:\stanford-postagger-2016-10-31\stanford-postagger.jar",
                options={
                    "normalizeParentheses": "false",
                    "normalizeOtherBrackets": "false"
                }).tokenize(file.read())
        ]
Example #20
def simp_syn_sent_(sent):
    strs = ""
    #print(sent)
    #import pdb; pdb.set_trace()
    tokens = StanfordTokenizer().tokenize(str(sent))
    #tokens = wordpunct_tokenize(str(sent))
    tokens.insert(0, '')

    re = list(eng_parser.raw_parse(sent))[0]
    root = re.root['word']

    node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in re.nodes.items():
        node_list.append(base.get_triples(node))

    #result = list(eng_parser.raw_parse(sent))[0]
    #root = result.root['word']

    strs = simp_relcl_sent(tokens, node_list)

    return strs
Example #21
def tokenize(text_list,
             clean_html=False,
             tokenizer="twitter",
             remove_reps=True,
             spell_correct=True):
    if tokenizer == "stanford":
        tolkenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tolkenizer_obj = TweetTokenizer()
    else:
        tolkenizer_obj = StringTokenizer()

    token_list = []
    for text in text_list:
        if clean_html:
            text = BeautifulSoup(text).get_text()
        if remove_reps:
            text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        tokens = tolkenizer_obj.tokenize(text)
        if spell_correct:
            tokens = [spell(t) for t in tokens]
        token_list.append(tokens)
    return token_list
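# Usage sketch (illustrative, with assumed inputs): the "twitter" backend needs
# no Stanford jar; clean_html strips markup via BeautifulSoup, remove_reps
# collapses runs of 3+ repeated characters to two, and spell_correct=False
# avoids the external spell() dependency.
# docs = ["Sooooo goooood!!!", "<p>Great &amp; cheap</p>"]
# print(tokenize(docs, clean_html=True, tokenizer="twitter", spell_correct=False))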
def stanford_tokenizer(str):

    tokenizer = StanfordTokenizer(
        path_to_jar=
        'D:/software/stanford-parser-full-3.7/stanford-parser-3.7.0-models.jar'
    )

    # sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    return tokenizer.tokenize(str)


# if __name__=='__main__':
#     sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
#     result = stanford_tokenizer(sent)
#     print(result)

# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# from nltk.tokenize import StanfordTokenizer
# s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
# StanfordTokenizer().tokenize(s)
# s = "The colour of the wall is blue."
# StanfordTokenizer(options={"americanize": True}).tokenize(s)
Example #23
    def __init__(self, **kwargs):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_segmenter = self.conf_corenlp["segmenter"]
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.segmenter = StanfordSegmenter(
            path_to_jar=prefix + conf_segmenter["path_to_jar"],
            path_to_sihan_corpora_dict=prefix +
            conf_segmenter["path_to_sihan_corpora_dict"],
            path_to_model=prefix + conf_segmenter["path_to_model"],
            path_to_dict=prefix + conf_segmenter["path_to_dict"],
            path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
            encoding=conf_segmenter["encoding"])
        self.enTokenizer = StanfordTokenizer(path_to_jar=prefix +
                                             conf_tokenizer["path_to_jar"])
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.frequency = defaultdict(int)
        pynlpir.open()
        pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"],
                                     Overwrite=False)

        try:
            self.excluded_docs = kwargs["excluded_docs"]
        except:
            self.excluded_docs = [""]

        # experimental features
        self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]
Example #24
def _get_sentence_embeddings(sentences,
                             ngram='bigrams',
                             model='concat_wiki_twitter'):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None

    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        s = ' <delimiter> '.join(
            sentences)  #just a trick to make things faster
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(
            ' <delimiter> ')
        assert (len(tokenized_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        if ngram == 'unigrams':
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            twitter_embbedings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter":
        return twitter_embbedings
    elif model == "wiki":
        return wiki_embeddings
    elif model == "concat_wiki_twitter":
        return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
Example #25
def get_sentence_embeddings(sentences, ngram='uni'):
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)  #just a trick to make things faster
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
    tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(
        ' <delimiter> ')
    if len(tokenized_sentences_SNLP) != len(sentences):
        print('SENT2VEC TOKENIZATION FAILED')
        tokenized_sentences_SNLP = sentences
    #assert(len(tokenized_sentences_SNLP) == len(sentences))
    if ngram == 'uni':
        embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                 MODEL_TORONTOBOOKS_UNIGRAMS, FASTTEXT_EXEC_PATH)
    elif ngram == 'bi':
        embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                 MODEL_TORONTOBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)
    else:
        raise (NotImplementedError)
    return embeddings
Example #26
    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes tagger parameter as an argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(path_to_jar=POSTagger.__path_to_jar,
                                              model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')

        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")
Example #27
def par_tokenize(text_list,
                 clean_html=False,
                 tokenizer="twitter",
                 remove_reps=True,
                 spell_correct=True):
    if tokenizer == "stanford":
        tolkenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tolkenizer_obj = TweetTokenizer()
    else:
        tolkenizer_obj = StringTokenizer()

    import multiprocessing as mp
    from functools import partial
    pool = mp.Pool(NUM_PROC)
    tolkenize_func = partial(__tolkenize_text_blob,
                             clean_html=clean_html,
                             remove_reps=remove_reps,
                             spell_correct=spell_correct,
                             tolkenizer_obj=tolkenizer_obj)
    token_list = pool.map(tolkenize_func, text_list)
    return token_list
def tokenize_and_save_corpus(corpus_filename, new_filename):
    with open(corpus_filename, 'r') as f:
        corpus_str = f.read()
    tokenized = StanfordTokenizer().tokenize(corpus_str)
    lowered = [w.lower() for w in tokenized]

    num = r'(?<!\S)(\d*\.?\d+|\d{1,3}(,\d{3})*(\.\d+)?)(?!\S)'
    number_words = {}
    new_words = []
    for word in lowered:
        if word in number_words:
            new_words.extend(number_words[word])
        else:
            numbers = re.findall(num, word)
            if numbers:
                number = numbers[0][0]
                nwords = word_numbers(number)
                number_words[word] = nwords
                new_words.extend(nwords)
            else:
                new_words.append(word)
    with open(new_filename, 'w') as f:
        f.write(' '.join(new_words).encode('utf-8'))
def tokenize(text_rdd,
             clean_html=False,
             tokenizer="twitter",
             remove_reps=True,
             spell_correct=True):
    if tokenizer == "stanford":
        tokenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tokenizer_obj = TweetTokenizer()
    else:
        tokenizer_obj = StringTokenizer()
    print("Processing {} tokns".format(text_rdd.count()))

    if (remove_reps):
        text_rdd = text_rdd.map(
            lambda text: re.sub(r'(.)\1{2,}', r'\1\1', text))
    if clean_html:
        text_rdd = text_rdd.map(lambda text: BeautifulSoup(text).get_text())
    tokens_rdd = text_rdd.map(lambda text: TweetTokenizer().tokenize(text))
    if spell_correct:
        tokens_rdd = tokens_rdd.map(lambda tokens: [spell(t) for t in tokens])
        #tokens_rdd = tokens_rdd.map(lambda tokens: [t for t in tokens])

    return tokens_rdd
import argparse
from nltk.tokenize import StanfordTokenizer


aparser = argparse.ArgumentParser(
    description="Run CoreNLP tokenizer on a TSV definition file")
aparser.add_argument(
    'input_filepath', type=str, help='input file path')
aparser.add_argument(
    'output_filepath', type=str, help='output file path')
aparser.add_argument(
    'corenlp_postagger_path', type=str, help="path to stanford-postagger.jar")

opt = aparser.parse_args()
tokenizer = StanfordTokenizer(path_to_jar=opt.corenlp_postagger_path,
                              options={"ptb3Escaping": "false",
                                       "tokenizePerLine": "true",
                                       "tokenizeNLs": "true"})
entries = []
definitions = []
with open(opt.input_filepath) as ifp:
    for line in ifp:
        parts = line.strip().split('\t')
        entries.append(parts[:-1])
        definitions.append(parts[-1])
def_str = "\n".join(definitions)
tokens = tokenizer.tokenize(def_str)
def_str = " ".join(tokens)
definitions = def_str.split("*NL*")
with open(opt.output_filepath, 'w') as ofp:
    for entry, definition in zip(entries, definitions):
        ofp.write("{}\t{}\n".format('\t'.join(entry), definition.strip()))
    elif token == '-RCB-':
        token = '}'
    return token


def tokenize_sentences(tknzr, sentences, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentences: a list of sentences
        - to_lower: lowercasing or not
    """
    return [tokenize(tknzr, s, to_lower) for s in sentences]


fileName = sys.argv[1]

SNLP_TAGGER_JAR = "/home/pgupta/stanford-postagger.jar"

sentences = []
with open(fileName, 'r') as fileinput:
    for line in fileinput:
        sentences.append(line)

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
s = ' <delimiter> '.join(sentences)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')

for sentence in tokenized_sentences_SNLP:
    print(sentence)
Example #32
def relcl(sent):
    strs = ""
    #print(sent)
    #import pdb; pdb.set_trace()
    tokens = StanfordTokenizer().tokenize(str(sent))
    #tokens = wordpunct_tokenize(str(sent))
    tokens.insert(0, '')

    re = list(eng_parser.raw_parse(sent))[0]
    root = re.root['word']

    node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in re.nodes.items():
        node_list.append(base.get_triples(node))

    #result = list(eng_parser.raw_parse(sent))[0]
    #root = result.root['word']

    #strs = simp_relcl_sent(tokens, node_list)

    dep = next(eng_parser.raw_parse(sent))
    result = list(dep.triples())

    nsubj = ""
    verb = ""
    for node in result:
        if 'acl:relcl' in node[1]:
            #import pdb; pdb.set_trace()
            nsubj = node[0][0]
            verb =  node[2][0]
        #break


    #import pdb; pdb.set_trace()
    nsubj_ind = tokens.index(nsubj)
    verb_ind = tokens.index(verb)

    #split_ind = tokens.index(COMMA)
    #import pdb; pdb.set_trace()

    #if split_ind < verb_ind:
    _str1 = tokens[:nsubj_ind+1]
    str1 = ' '.join(_str1) + " . "

    _str2 = tokens[nsubj_ind+1:]
    if _str2[0] in PUNCTUATION:
        _str2.pop(0)
    if ('which' in _str2[0]) or ('who' in _str2[0]):
        _str2.pop(0)

    str2 = base.replace_nsubj(tokens, nsubj) +  ' '.join(_str2)

    strs = str1 + str2
    """
    stree = [parse.tree() for parse in eng_parser.raw_parse(sent)][0]

    #import pdb; pdb.set_trace()
    for postn in stree.treepositions():
        if stree.label().endswith("=H"):
            parentpos = postn[:-1]
            partial = Tree(stree[parentpos].label(), [ stree[postn] ])
    """

    #import pdb; pdb.set_trace()
    #strs = simp_relcl_sent(result)

    """
    lst = []
    se = 0
    head = ""
    dependent = ""
    for nd in re:
        if 'nsubj' in nd[1] or 'nsubjpass' in nd[1]:
            head = nd[0][0]
            dependent = nd[2][0]
    """

    #for node in node_list[1:]:

    return strs
Example #33
    def __init__(self, classifier, jar_file, field_to_process, output_field):
        self.classifier = classifier
        self.jar_file = jar_file
        self.field_to_process = field_to_process
        self.output_field = output_field
        self.tokenizer = StanfordTokenizer(path_to_jar=self.jar_file).tokenize
 #lines = f.read().encode('utf-8').split('</text>')
 #for index, line in enumerate(lines):
 # remove leading and trailing whitespace
 lines = file.encode('utf-8').split('</text>')
 for line in lines:
     newline = ''
     try:
         if "<text xml:space=\"preserve\">" in line and "#REDIRECT" not in line:
             newline = line[line.find("<text xml:space=\"preserve\">") +
                            len("<text xml:space=\"preserve\">"):]
             if guess_language(newline) == 'en':
                 s = re.sub(
                     '[^A-Za-z0-9\s.,\'\";?$%+-:!]+', '@',
                     re.sub('\d', '0', newline).replace('[', ' ').replace(
                         ']', ' ').replace('}', ' ').replace('{', ' '))
                 s2 = StanfordTokenizer().tokenize(s)
                 s3 = [word.encode('ascii') for word in s2]
                 charCounter = 0
                 tokenCounter = 0
                 sentStart = 0
                 deleteThese = []
                 for index, token in enumerate(s3):
                     if token == '.':
                         if charCounter < 20 or tokenCounter < 5:
                             deleteThese.append([sentStart, index])
                         charCounter = 0
                         tokenCounter = 0
                         sentStart = index + 1
                     else:
                         charCounter += len(token)
                         tokenCounter += 1
# -*- coding: utf-8 -*-
from nltk.tokenize import StanfordTokenizer
import time


def is_ascii(s):
    return all(ord(c) < 128 for c in s)


last_time = time.time()
line_buffer = ''
with open('WestburyLab.Wikipedia.Corpus.txt') as infp, open(
        'TokenizedCorpus.txt', 'w') as outfp:
    for e, line in enumerate(infp):
        if (e + 1) % 10000 == 0:
            line_buffer = StanfordTokenizer().tokenize(line_buffer)
            try:
                outfp.write(' '.join(line_buffer) + '\n')
            except:
                for i in range(len(line_buffer)):
                    if not is_ascii(line_buffer[i]):
                        line_buffer[i] = '<UNK>'
                outfp.write(' '.join(line_buffer) + '\n')
            line_buffer = ''
            print(e + 1, '/ 30749930',
                  float(e + 1) / 30749930, time.time() - last_time)

        if line.strip() == '':
            continue
        line_buffer += (line + ' <br> ')
Example #36
java_path = "C:/Program Files/Java/jre1.8.0_131/bin/java.exe"
parser_path = "D:/stanford-parser-full-2016-10-31/stanford-parser.jar"
models_path = "D:/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
engPCFG_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

import os
os.environ['JAVA_HOME'] = java_path

from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(parser_path)

from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(parser_path, models_path, engPCFG_path)

from nltk.corpus import wordnet

import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor
from nltk.corpus import wordnet

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()

tags = semcor.tagged_sents(tag = 'sem')
Example #37
def _simp_syn_sent(sent, _algs=range(1,10)):
    strs = ""
    
    """ 
    # order the ALG for the better performance(precision/recall)
    _algs_lst_ = [
        paratax.simp_paratax_sent,
        #punct.simp_punct_sent,
        subordi.simp_subordi_sent,
        adverb.simp_adverb_sent,
        parti.simp_parti_sent,
        appos.simp_appos_sent,
        adjec.simp_adjec_sent,
        coordi.simp_coordi_sent,
        passive.simp_passive_sent
    ]
    """
    # the original tokens in the sent
    #print "syn sent: ", sent
    #import pdb; pdb.set_trace()
    tokens = StanfordTokenizer().tokenize(sent)
    #tokens = wordpunct_tokenize(strs)
    tokens.insert(0, '')
    #taggers = eng_tagger.tag(sent.split())

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']

    #w = result.tree()
    #print "parse_tree:", w
    
    #TODO: use the tree structure, Check again
    node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))
        #node_list[base.get_triples[0]] = base.get_triples(node)

    alg = ""
    """
    #import pdb; pdb.set_trace()
    if len(sent) > 0:
        for ind in _algs:
            #import pdb; pdb.set_trace()
            # if the alg in the choices
            print "_alg: ", _algs_lst[ind]
            if len(strs) > 0:
                return strs, _algs_lst[ind]
            else:
                #func = _algs_lst[ind]
                strs = _algs_lst[ind](tokens,node_list)
    """           
    # use the most robust functions for the experiments:
    # try each simplification in turn and return the first non-empty result
    if len(sent) > 0:
        for name, simp in [("paratax", paratax.simp_paratax_sent),
                           ("punct", punct.simp_punct_sent),
                           ("subordi", subordi.simp_subordi_sent),
                           ("adverb", adverb.simp_adverb_sent),
                           ("parti", parti.simp_parti_sent),
                           ("appos", appos.simp_appos_sent),
                           ("adjec", adjec.simp_adjec_sent),
                           ("coordi", coordi.simp_coordi_sent),
                           ("passive", passive.simp_passive_sent),
                           ("relcl", relcl.simp_relcl_sent)]:
            strs = simp(tokens, node_list)
            if len(strs) > 0:
                alg = name
                return strs, alg

    return strs, alg
Example #38
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from wordsegment import load, segment

CUR_DIRECTORY = '/home/wmq/Desktop/DeepText/StanfordNLP'
SEGMENT_PATH = CUR_DIRECTORY + '/stanford-segmenter-3.8.0.jar'
NER_MODEL_PATH = CUR_DIRECTORY + '/english.all.3class.distsim.crf.ser.gz'
NER_JAR_PATH = CUR_DIRECTORY + '/stanford-ner.jar'
POS_MODEL_PATH = CUR_DIRECTORY + '/english-left3words-distsim.tagger'
POS_JAR_PATH = CUR_DIRECTORY + '/stanford-postagger.jar'

ner_tagger = StanfordNERTagger(NER_MODEL_PATH, NER_JAR_PATH, java_options='')
pos_tagger = StanfordPOSTagger(POS_MODEL_PATH, POS_JAR_PATH, java_options='')
tokenizer = StanfordTokenizer(SEGMENT_PATH)
load()

s = "@user nah pretty sure it's jackson's great jokes"
ws = tokenizer.tokenize(s)
print(' '.join(ws))
# print (' '.join(segment('#happythankgiving')))
# s = 'i got to to go formal with my best friend @ phi mu at jsu'.split()
# ner_sent = ner_tagger.tag(s)
# pos_sent = pos_tagger.tag(s)
# print (ner_sent)
# print (pos_sent)
Example #39
java_path = "C:/Program Files/Java/jre1.8.0_131/bin/java.exe"
parser_path = "D:/stanford-parser-full-2016-10-31/stanford-parser.jar"
models_path = "D:/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
engPCFG_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

import os
os.environ['JAVA_HOME'] = java_path

import sys

from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(parser_path)

from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(parser_path, models_path, engPCFG_path)

from nltk.corpus import wordnet

import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Synset
from nltk.corpus import semcor
from nltk.corpus import wordnet
from nltk.wsd import lesk

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb
            else:
                result.append([k1,k2])
                i = i+2
        else:
            i=i+1
    return result


if __name__ == '__main__':  # very important
    # res = request([["excellent"],["poor"]])
    poshit = 1510000000032
    neghit = 771000000037 
    print(poshit)
    print(neghit)
    stopword = ["-LSB-","-RSB-","-LRB-","-RRB-"]              
    tokenizer = StanfordTokenizer()
    filename = "F:/course/sentimentcode/rt-polarity.neg"
    file_object = codecs.open(filename,'r','utf-8')
    allres = []
    try:
        all_the_text = file_object.read()
        arr = tokenizer.tokenize(all_the_text)
        la = len(arr)
        correct = 0
        for line in arr:
            ax = line.split()
            wordarr = []
            for word in ax:
                if word in stopword:
                    continue
                wordarr.append(word)