Code example #1
def add_data(path='./data'):
    list_json = os.listdir(path)
    for file_name in list_json:
        paths = os.path.join(path, file_name)
        with open(paths) as json_file:
            data = json.load(json_file)
            data = list(data)
            for field in data:
                field["content"] = ViTokenizer.tokenize(
                    field["content"]) if field['content'] else 'nothing'
                field["title"] = ViTokenizer.tokenize(
                    field["title"]) if field['title'] else 'nothing'
                field["description"] = ViTokenizer.tokenize(
                    field["description"]
                ) if field['description'] else 'nothing'
                field["topic"] = ViTokenizer.tokenize(
                    field["topic"]) if field['topic'] else 'nothing'
                field["author"] = field["author"].strip().replace(
                    ' ', '_') if (field['author']
                                  and field['author'].strip()) else 'unknown'
                field['publish_date'] = field['publish_date'] if field[
                    'publish_date'] else 'unknown'

            solr.add(data)
    return jsonify("OK")
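For context, a hedged sketch of the surrounding setup this function (apparently a Flask view, given jsonify) appears to assume; the Solr URL, core name, and record shape below are illustrative assumptions, not taken from the original project.

# Illustrative setup only -- names and URL are assumptions.
import os
import json

import pysolr
from flask import Flask, jsonify
from pyvi import ViTokenizer

app = Flask(__name__)
solr = pysolr.Solr('http://localhost:8983/solr/articles', always_commit=True)

# Each file under ./data is assumed to contain a JSON array of records such as:
# [{"content": "...", "title": "...", "description": "...",
#   "topic": "...", "author": "...", "publish_date": "..."}]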
Code example #2
    def tokenizer(self, item):

        # tokenizer
        title_token = self.removeStopwords(
            re.sub(r"(\d+(\,*\.*\d+)+)", "_NUMBER",
                   ViTokenizer.tokenize(item['title'])))
        description_token = self.removeStopwords(
            re.sub(r"(\d+(\,*\.*\d+)+)", "_NUMBER",
                   ViTokenizer.tokenize(item['description'])))

        # word count
        item['title_wc'] = {
            word: title_token.count(word)
            for word in title_token
        }
        item['description_wc'] = {
            word: description_token.count(word)
            for word in description_token
        }

        # remove col
        self.list_key = ['title_wc', 'square', 'price', 'description_wc']
        for key in list(item.keys()):
            if key not in self.list_key:
                del item[key]

        return item
Code example #3
def make_w2vec_matrix(question, paragraph, model=word2vec):
    train_question = preprocess_sentence(question)
    train_answers = preprocess_sentence(paragraph)
    tokens_question = ViTokenizer.tokenize(train_question).split()
    tokens_answer = ViTokenizer.tokenize(train_answers).split()
    question_embs = []
    answer_embs = []
    for i in range(len(tokens_question)):
        if tokens_question[i] in model:
            question_embs.append(model[tokens_question[i]])
        else:
            question_embs.append(model['unknown'])
    for i in range(len(tokens_answer)):
        if tokens_answer[i] in model:
            answer_embs.append(model[tokens_answer[i]])
        else:
            answer_embs.append(model['unknown'])
    question_embs = np.array(question_embs)
    answer_embs = np.array(answer_embs)
    """
	if question_embs.shape[0] < MIN_LENGTH_QUESTION:
	question_embs = np.pad(question_embs, ((4,4), (0,0)))
	"""

    if answer_embs.shape[0] < MIN_LENGTH_ANSWER:
        paddings = np.ceil(MIN_LENGTH_ANSWER / answer_embs.shape[0])
        d = np.copy(answer_embs)
        for i in range(int(paddings)):
            answer_embs = np.concatenate((answer_embs, d))

    return question_embs, answer_embs
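A hedged sketch of the module-level names make_w2vec_matrix relies on; the constants, the model path, and the preprocess_sentence stand-in below are illustrative assumptions.

# Illustrative module-level setup -- values and the helper body are assumptions.
import numpy as np
import gensim
from pyvi import ViTokenizer

MIN_LENGTH_QUESTION = 10   # assumed minimum token counts
MIN_LENGTH_ANSWER = 50

word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    'baomoi.model.bin', binary=True)


def preprocess_sentence(text):
    # minimal stand-in: the real project may strip punctuation as well
    return text.lower().strip()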
Code example #4
def predict_articles(articles):
    # simple_preprocess returns a token list and tokenize returns a new string,
    # so assign the results back instead of discarding them
    articles = [
        ViTokenizer.tokenize(' '.join(gensim.utils.simple_preprocess(article)))
        for article in articles
    ]

    content_data_tfidf = tfidf_vector.transform(articles)
    prediction = trained_model.predict(content_data_tfidf)
    return prediction
Code example #5
 def __get_keywords_from_text(text):
     tokens = ViTokenizer.tokenize(text)
     tokens = ViTokenizer.spacy_tokenize(tokens)[0]
     tokens = list(filter(lambda x: len(x) > 1, tokens))
     counter_tokens = Counter(tokens)
     counter_tokens = dict(counter_tokens)
     counter_tokens = dict(
         sorted(counter_tokens.items(), key=lambda x: -x[1]))
     return counter_tokens
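As a reminder of the two pyvi calls used here, a minimal usage sketch (the sample sentence is illustrative):

from pyvi import ViTokenizer

text = "Hà Nội là thủ đô của Việt Nam"
segmented = ViTokenizer.tokenize(text)            # multi-syllable words joined with "_", e.g. "Hà_Nội"
words, spaces = ViTokenizer.spacy_tokenize(text)  # parallel lists of tokens and trailing-space flags
print(segmented)
print(words)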
Code example #6
def searchDongNghia(sentence):

    content = ViTokenizer.tokenize(sentence)

    list_word = content.split()

    stopwords = []
    f = open('vietnamese-stopwords.txt', 'r')
    for line in f:
        line = line.rstrip()
        #print(line)
        line = line.replace(' ', '_')

        stopwords.append(line)
    f.close()

    words = []
    for word in list_word:
        word = word.lower()
        if word not in stopwords:
            words.append(word)

    model = gensim.models.KeyedVectors.load_word2vec_format(
        'model/baomoi.model.bin', binary=True)

    N = 3

    list_dongnghia = []

    for word in words:

        # check whether the word is in the model's vocabulary; if it is,
        # look up its N most similar words
        if word not in model:
            continue

        # KeyedVectors exposes most_similar directly (the .wv attribute belongs
        # to full Word2Vec models)
        dongnghia = model.most_similar(positive=[word], topn=N)

        for i in range(0, N):

            list_dongnghia.append(dongnghia[i][0].replace('_', ' '))
            #print(list_dongnghia)

    results = []
    for word in list_dongnghia:
        # query Solr for each similar word rather than repeating the original sentence
        key = 'description:"' + word + '"'

        result = solr.search(key)

        results.append(result)

    return results
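A hedged usage note: loading the word2vec binary inside the function re-reads a large file on every call; a sketch of loading it once at module level instead (the query word is illustrative):

import gensim

# Load the pretrained Vietnamese vectors once, then reuse them across calls.
kv = gensim.models.KeyedVectors.load_word2vec_format('model/baomoi.model.bin',
                                                     binary=True)
word = 'học_sinh'  # illustrative query word
if word in kv:
    # most_similar returns a list of (word, cosine similarity) pairs
    print(kv.most_similar(positive=[word], topn=3))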
Code example #7
File: chatbot.py  Project: nguyentu43/simple_chat_bot
    def convert_st_to_bow(self, st):
        bow = [0] * len(self.words)
        tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))

        if not (len(tagger[1]) == 1 and tagger[1][0] == 'Np'
                and tagger[0][0] not in SKIP_WORDS):
            tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st.lower()))

        for i, j in enumerate(tagger[1]):
            if j in REPLACE:
                tagger[0][i] = REPLACE[tagger[1][i]]
            if tagger[0][i] in self.words:
                bow[self.words.index(tagger[0][i])] = tagger[0].count(
                    tagger[0][i])
        return np.array(bow)
Code example #8
def makeSummary(sentences, n, scores):

    sentences = sorted(sentences, key=lambda x: x.getScore(), reverse=True)
    i = 0
    # the top-ranked sentence is counted toward the length budget, so include it as well
    summary = [sentences[i]]
    length_summary = len(
        ViTokenizer.tokenize(sentences[i].getOriginalWords().strip()).split())
    while length_summary < n and i + 1 < len(sentences):
        i += 1
        summary += [sentences[i]]
        length_summary += len(
            ViTokenizer.tokenize(
                sentences[i].getOriginalWords().strip()).split())

    return summary
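makeSummary only needs objects exposing getScore() and getOriginalWords(); a minimal hypothetical stand-in for trying the function (the real class comes from the summarizer project):

class Sentence:
    """Hypothetical minimal stand-in for the project's sentence object."""

    def __init__(self, original_words, score):
        self._original_words = original_words
        self._score = score

    def getScore(self):
        return self._score

    def getOriginalWords(self):
        return self._original_words


# e.g. makeSummary([Sentence("Hà Nội là thủ đô của Việt Nam .", 0.9)], n=5, scores=None)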
Code example #9
def segment_tree(tree):
    for par in tree.findall(".//paragraph"):
        par_text = ""
        par_original_text = ""
        char_attrib_in_par = []
        char_in_par = []
        count_line = 0
        for lines in par:
            count_line += 1
            t = ''
            for c in lines:
                if (c.text is None) or (c.text == "\n"):
                    # t += r" "
                    c.text = " "
                if (c.text == ''):
                    continue
                t += c.text
                par_original_text += c.text
                char_attrib_in_par.append(c)
            par_text = par_text + " " + t[:-1].strip()
        if (len(par) > 1):
            par_text = ViTokenizer.tokenize(par_text).replace("_", " ").replace("-", "").replace("+", "")
            list_sentences = segmenter.segment_long(par_text.strip(), n_window=10)
        else:
            list_sentences = [par_text.strip()]

        search_idx = 0
        for i in range(len(list_sentences)):
            sentence = etree.Element("sentence")

            list_word = ViTokenizer.tokenize(list_sentences[i]).split()
            lookup = link_coord(par_original_text, list_word, search_idx)
            if lookup:
                j = 0
                while (j < len(lookup)):
                    word = etree.Element("word")
                    start_word_idx = lookup[j]
                    end_word_idx = lookup[j + 1]
                    for idx in range(start_word_idx, end_word_idx + 1):
                        word.append(char_attrib_in_par[idx])
                    sentence.append(word)
                    j += 2
                # print(s_t)
                search_idx = lookup[-1] + 1
                par.append(sentence)
    for layout in tree.findall(".//textline"):
        layout.getparent().remove(layout)
    return tree
Code example #10
def PreprocessingData(i):
    # remove special characters (checked character by character)
    for ch in i:
        if ch in SPECIAL_CHARACTER:
            i = i.replace(ch, "")
            i = i.replace("  ", " ")
    # drop abnormally long tokens
    for word in i.split(" "):
        if len(word) > 20:
            i = i.replace(word, "")
            i = i.replace("  ", " ")

    i = ViTokenizer.tokenize(i)
    # filter stop words token by token; str.replace would also strip
    # matching substrings inside longer, underscore-joined words
    my_words = [word for word in i.split(" ") if word not in STOP_WORDS]
    i = " ".join(my_words)
    i = i.lower()
    return i
Code example #11
 def transform(self, X, y=None, **fit_params):
     result = [ViTokenizer.tokenize(text.lower()) for text in X]
     return [
         " ".join([
             token for token in text.split() if token not in self.stopwords
         ]) for text in result
     ]
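This transform fits the scikit-learn transformer protocol; a hedged sketch of the full class and a pipeline around it (the class name and stop-word list are illustrative assumptions):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from pyvi import ViTokenizer


class ViPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords=()):
        self.stopwords = set(stopwords)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None, **fit_params):
        result = [ViTokenizer.tokenize(text.lower()) for text in X]
        return [" ".join(tok for tok in text.split() if tok not in self.stopwords)
                for text in result]


pipeline = Pipeline([("prep", ViPreprocessor(["là", "của"])),
                     ("tfidf", TfidfVectorizer())])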
Code example #12
    def handle(self):

        newData = []

        for v in self.data:
            t = v[self.content].lower()

            if self.html:
                t = BeautifulSoup(t, 'html.parser').get_text()

            # Normalize repeated (elongated) characters
            t = re.sub(r'(\D)\1+', r'\1', t)

            # Word segmentation
            t = ViTokenizer.tokenize(t)

            if self.accented_char:
                t = unicodedata2.normalize('NFD',
                                           t).encode('ascii',
                                                     'ignore').decode("utf-8")

            if self.special_char:
                t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]

            if self.stopwords:
                # t is still a plain string unless special_char already split it;
                # split on whitespace so words, not characters, are filtered
                if isinstance(t, str):
                    t = t.split()
                t = [word for word in t if word not in self.list_stopword]

            v[self.content] = t
            if v not in newData:
                newData.append(v)

        print(np.array(newData))
Code example #13
def text_postag(text):
    pos_tag = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    dict_tag = {}
    for i in range(len(pos_tag[0])):
        dict_tag[pos_tag[0][i]] = pos_tag[1][i]

    return dict_tag
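A short usage sketch; ViPosTagger.postagging returns two parallel lists (tokens and tags), and the sample sentence is illustrative:

from pyvi import ViTokenizer, ViPosTagger

tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize("Hà Nội là thủ đô của Việt Nam"))
print(list(zip(tokens, tags)))   # pairs such as ("Hà_Nội", "Np")
print(text_postag("Hà Nội là thủ đô của Việt Nam"))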
Code example #14
def word_segment(sent):
    '''
    Args:
      sent: A string. A sentence.
    
    Returns:
      A list of words.
    '''
    global lcode
    if lcode in ['ko']:
        words = [word for word, _ in kkma.pos(sent)]
    elif lcode in ['ja']:
        words = mecab.parse(sent.encode('utf8')).split() 
    elif lcode in ['th']:
        words = pythai.split(sent)
    elif lcode in ['vi']:
        words = ViTokenizer.tokenize(sent).split()        
    elif lcode in ['zh']:
        words = list(jieba.cut(sent, cut_all=False)) 
#     elif lcode in ['ar']:
#         words = segmenter.segment(sent).split()
    else: # Mostly european languages
        words = sent.split()
    
    return words
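A hedged usage sketch for the Vietnamese branch only (lcode is a module-level global in the original script):

from pyvi import ViTokenizer

lcode = 'vi'
print(word_segment("Hà Nội là thủ đô của Việt Nam"))
# pyvi joins multi-syllable words with underscores, e.g. "Hà_Nội"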
Code example #15
File: Dataset.py  Project: latruonghai/query
 def tokenize(self):
     dem = 1
     len1, len2 = (0, 0)
     start = time()
     for folder, subfolder, file in os.walk(self.pathToTxt):
         if len(file) == 0:
             continue
         else:
             for fi in file:
                 new_path = os.path.join(folder, fi)
                 with open(new_path, 'r') as f:
                     content = f.read()
                     content = self.removeCommas(content)
                     len1 += len(content)
                     pos = ViTokenizer.tokenize(content)
                     new_text = self.removeStopWord(pos)
                     new_text = ' '.join(new_text)
                     len2 += len(new_text)
                 path_to_save = 'dataset' + str(dem) + '.txt'
                 path_to_dataset = os.path.join(self.dataset_path,
                                                path_to_save)
                 with open(path_to_dataset, 'w+') as f:
                     f.write(new_text)
                 dem += 1
     end_time = time() - start
     print('Done in {}s, with {}% change'.format(end_time,
                                                 (len2 / len1) * 100))
Code example #16
def get_data(folder_path, mode=None):
    type_data = folder_path.split('/')[-1].split('_')[0].lower()

    if mode is None:
        X = []
        y = []
        dirs = os.listdir(folder_path)
        print(dirs)
        for path in dirs:
            file_paths = os.listdir(os.path.join(folder_path, path))
            for file_path in tqdm(file_paths, desc=path):
                with open(os.path.join(folder_path, path, file_path),
                          'r',
                          encoding='utf-16') as f:
                    lines = f.readlines()
                    lines = ' '.join(lines)
                    lines = ViTokenizer.tokenize(lines)
                    lines = gensim.utils.simple_preprocess(
                        lines)  # remove symbols
                    lines = ' '.join(lines)

                    X.append(lines)
                    y.append(path)

    elif mode == 'from_file':
        with open('./data/X_' + type_data + '.pkl', 'rb') as f:
            X = pickle.load(f)

        with open('./data/y_' + type_data + '.pkl', 'rb') as f:
            y = pickle.load(f)

    return X, y
Code example #17
def clean_text(text, stopwords, acronyms):
    t = text.lower()

    t = ' '.join(t.split())

    t = BeautifulSoup(t, 'html.parser').get_text()

    # Normalize repeated (elongated) characters
    t = re.sub(r'(\D)\1+', r'\1', t)

    for key in acronyms:
        for value in acronyms[key]:
            if value in t:
                t = t.replace(value, key)

    # Remove diacritics (the normalisation below is commented out)
    # Word segmentation
    t = ViTokenizer.tokenize(t)

    # t = unicodedata2.normalize('NFD', t).encode(
    #     'ascii', 'ignore').decode("utf-8")

    t = [x.strip(settings.SPECIAL_CHARACTER) for x in t.split()]

    t = [word for word in t if word not in stopwords]

    return " ".join(t)
Code example #18
def clean_text(text, stopwords, acronyms):
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

    t = text.lower()

    t = BeautifulSoup(t, 'html.parser').get_text()

    t = " ".join([x.strip(settings.SPECIAL_CHARACTER) for x in t.split()])

    print(t)

    # Normalize repeated (elongated) characters
    t = re.sub(r'(\D)\1+', r'\1', t)

    t = " " + t + " "

    for key in acronyms:
        for value in acronyms[key]:
            v = ' ' + value + ' '
            if v in t:
                t = t.replace(v, ' ' + key + ' ')

    # Word segmentation
    t = ViTokenizer.tokenize(t)

    t = [word for word in t.split() if word not in stopwords]

    return " ".join(t)
Code example #19
def pyvi_prc(text):
    tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    result = {}
    for i in range(len(tokens)):
        tokens[i] = tokens[i].replace('_', ' ')
        result[tokens[i]] = tags[i]
    return result
Code example #20
def load_and_clean_data(doc):
    paths = glob.glob("./DataRaw/" + doc + "/*.txt")
    data = []
    for path in paths:
        with open(path, encoding="utf-8") as file:
            text = file.read()
            text_lower = text.lower()
            text_token = ViTokenizer.tokenize(text_lower)
            data.append(text_token)
    stop_words = []
    with open("./Stopword/vietnamese-stopwords.txt", encoding="utf-8") as f:
        text = f.read()
        for word in text.split():
            stop_words.append(word)
    punc_ = list(punctuation)
    stop_word = stop_words + punc_
    sentences = []
    for d in data:
        sent = []
        for word in d.split(" "):
            if word not in stop_word:
                if "_" in word or word.isalpha() is True:
                    sent.append(word)
        sentences.append(" ".join(sent))
    return sentences
Code example #21
def main():
    keyword = input("Nhập từ khoá tìm kiếm: ")
    keyword_format = "%{}%".format(keyword)
    keyword_tokennize = tach_tu.tokenize(keyword)
    print("Từ khoá: ", keyword_tokennize)

    bow = str(keyword_tokennize).split(' ')
    print("bow tìm kiếm :", bow)
    word_dict = creat_word_count_dict(keyword_format)
    print("Số lần xuất hiện :", word_dict)
    # compute TF
    tf = compute_TF(word_dict, bow)
    print("Kết quả tf:", tf)
    # compute IDF
    idf = compute_IDF(word_dict)
    print("Kết quả idf:", idf)
    # Finally: compute TF-IDF by multiplying the TF and IDF results above
    tf_idf = compute_TFIDF(tf, idf)
    for key, value in tf_idf.items():
        if (key == keyword_tokennize):
            print(key, ":", value)
    #print(sorted(tf_idf.values(),reverse=True))
    print(tf_idf)

    # plot the result as a chart
    df = pd.DataFrame([tf_idf])
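The compute_* helpers are project-specific; a hedged sketch of the usual TF and TF-IDF arithmetic they appear to implement (compute_IDF is omitted because its document-frequency source is not shown):

def compute_TF(word_dict, bow):
    # term frequency: occurrences of each term divided by the number of tokens in the query
    return {word: count / len(bow) for word, count in word_dict.items()}


def compute_TFIDF(tf, idf):
    # element-wise product of the TF and IDF scores over the shared vocabulary
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}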
Code example #22
    def res_sentence(self, test_sentence):
        test_sentence = ViTokenizer.tokenize(test_sentence)
        test_sentence, pos = ViPosTagger.postagging(test_sentence)
        new_words, pos = self.process(test_sentence, pos)
        X_test = self.sent2features(new_words, pos)
        new_tags = self.crf.predict_single(X_test)
        st1, st2 = [], []
        for i in range(len(new_words)):
            if new_tags[i] == 'O':
                if new_tags[i - 1] != 'O':
                    st1.append(new_words[i])
                    st2.append('O')
                    continue
                else:
                    if i == 0:
                        st1.append(new_words[i])
                        st2.append('O')
                    else:
                        st1[-1] = st1[-1] + '_' + new_words[i]
            elif new_tags[i][0] == 'B':
                tag = "" + new_tags[i][2:]

                st1.append(new_words[i])
                st2.append(tag.upper())

            elif new_tags[i][0] == 'I':
                st1[-1] = st1[-1] + '_' + new_words[i]
        return st1, st2
Code example #23
def markdown_to_text(markdown_string,
                     parser="html.parser",
                     tags=['pre', 'code', 'a', 'img', 'i']):
    """ Converts a markdown string to plaintext
    https://stackoverflow.com/questions/18453176
    """

    import mistune  # noqa
    # md -> html -> text since BeautifulSoup can extract text cleanly
    markdown = mistune.Markdown()
    html = markdown(markdown_string)

    soup = BeautifulSoup(html, parser)
    # remove code snippets
    text = preprocessing_tags(soup, tags)

    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_punctuation(text)
    text = text.replace('\n', ' ')
    text = remove_numeric(text)
    text = remove_multiple_space(text)
    text = text.lower().strip()
    text = ViTokenizer.tokenize(text)
    text = remove_stopwords(text, stopwords=stopwords)

    return text
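The remove_* helpers used above are project-specific; a hedged sketch of what a few of them plausibly do (the regexes are illustrative):

import re


def remove_multiple_space(text):
    # collapse runs of whitespace into a single space
    return re.sub(r"\s+", " ", text)


def remove_numeric(text):
    # drop digit runs
    return re.sub(r"\d+", " ", text)


def remove_emails(text):
    # drop anything that looks like an e-mail address
    return re.sub(r"\S+@\S+", " ", text)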
Code example #24
def segment(tree):
    final_list = []
    for par in tree.findall(".//paragraph"):
        p = ""
        count_line = 0
        for line in par:
            count_line += 1
            t = ''
            for c in line:
                if c.text is None:
                    t += r"!0"
                else:
                    t += c.text
            p = p + " " + t[:-1].strip()
        if (len(par) > 1):
            p = ViTokenizer.tokenize(p).replace("_", " ").replace("-", "").replace("+", "")
            list_sentences = segmenter.segment_long(p.strip(), n_window=10)
        else:
            list_sentences = [p.strip()]
        for i in range(len(list_sentences)):
            if (len(list_sentences[i]) > 0 and list_sentences[i].strip()[-1] != '.'):
                list_sentences[i] = list_sentences[i] + " ."
        final_list.extend(list_sentences)
        # for sent in list_sentences:
        #     print(sent+"\n")
        # print("----------------------------------------------------------------------------------------")
    return final_list
Code example #25
File: demo.py  Project: vudat1710/CRF
def main():
    loaded_model = pickle.load(
        open('finalized_model_no_pos_chunk_name_process.pkl', 'rb'))
    result = {}
    text = input('Enter some text: \n\n')
    tokenized = ViTokenizer.tokenize(text)
    raw_text = parse_raw_input(tokenized)
    word_featured = [get_features(s) for s in raw_text]
    preds = loaded_model.predict(word_featured)
    temp_sent_list = tokenized.split('.')
    sent_list = []
    for i in range(len(temp_sent_list)):

        if len(temp_sent_list[i]) > 0:
            sent_list.append(temp_sent_list[i].strip())
    print("\n\nResult : \n")
    for i in range(len(sent_list)):
        result = []
        current_sent = sent_list[i]
        current_tag = preds[i]
        tokens = current_sent.split(' ')
        if len(current_tag) > len(tokens):
            tokens.append('.')
        if len(current_sent) > 0:
            for j in range(len(tokens)):
                result.append([tokens[j], current_tag[j]])
        print(str(i) + " : ", end=" ")
        for part in result:
            if part[1] == "O":
                print(part[0], end=" ")

            else:
                # print("<"+part[1]+">" + part[0] +"</"+part[1]+">", end = " ")
                print(part[0] + "/" + part[1], end=" ")
        print("\n")
Code example #26
    def buildSummary(self, sentences, n):
        sentences = sorted(sentences,
                           key=lambda x: x.getLexRankScore(),
                           reverse=True)
        i = 0
        # include the top-ranked sentence, whose length already counts toward the budget
        summary = [sentences[i]]
        length_summary = len(
            ViTokenizer.tokenize(sentences[i].getOGwords().strip()).split())
        while length_summary < n and i + 1 < len(sentences):
            i += 1
            summary += [sentences[i]]
            length_summary += len(
                ViTokenizer.tokenize(
                    sentences[i].getOGwords().strip()).split())

        return summary
Code example #27
def BigClassifier(contents):
    input_ = []
    contents = gensim.utils.simple_preprocess(contents)
    contents = ' '.join(contents)
    contents = ViTokenizer.tokenize(contents)
    contents = contents.split()
    result = [word for word in contents if word.lower() not in stop_word]
    contents = ' '.join(result)
    input_.append(contents)
    X_data.append(input_[0])
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 max_features=7000,
                                 max_df=0.8,
                                 min_df=1)
    tfidf_vect.fit(X_data)

    X_data_tfidf = tfidf_vect.transform(X_data)

    X_test_tfidf = X_data_tfidf[-1]
    X_data_tfidf = X_data_tfidf[0:6000]
    feature = tfidf_vect.get_feature_names_out()
    encoder = preprocessing.LabelEncoder()
    y_data_n = encoder.fit_transform(y_data)
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(X_data_tfidf, y_data_n)
    test_predictions = classifier.predict(X_test_tfidf)[0]

    return (categorize[test_predictions])
Code example #28
def clean_text(text, stopwords, acronyms):
    t = text.lower()

    t = ' '.join(t.split())

    t = BeautifulSoup(t, 'html.parser').get_text()

    # Normalize repeated (elongated) characters
    t = re.sub(r'(\D)\1+', r'\1', t)

    t = " ".join([x.strip(settings.SPECIAL_CHARACTER) for x in t.split()])

    t = ' ' + t + ' '

    for key in acronyms:
        for value in acronyms[key]:
            if value in t:
                t = t.replace(value, key)

    # Word segmentation
    t = ViTokenizer.tokenize(t)

    # split into words first; iterating the string directly would filter characters
    t = [word for word in t.split() if word not in stopwords]

    return " ".join(t)
Code example #29
def exec(post, content):
    try:
        if post.source_info.name == 'V' and not post.has_summary:
            content = content.lower().strip()
            sentences = nltk.sent_tokenize(content)
            vocab = w2v.wv.vocab
            X = []
            for sentence in sentences:
                sentence = ViTokenizer.tokenize(sentence)
                words = sentence.split(" ")
                sentence_vec = np.zeros((100))
                for word in words:
                    if word in vocab:
                        sentence_vec += w2v.wv[word]
                X.append(sentence_vec)
            n_clusters = post.sentences_of_summary
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans = kmeans.fit(X)
            avg = []
            for j in range(n_clusters):
                idx = np.where(kmeans.labels_ == j)[0]
                avg.append(np.mean(idx))
            closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                       X)
            ordering = sorted(range(n_clusters), key=lambda k: avg[k])
            summary = ' '.join([sentences[closest[idx]] for idx in ordering])

            post.summary = summary
            post.has_summary = True
            post.save()
    except Exception:
        post.has_summary = True
        post.save()
        return 'fail post ' + post.id
Code example #30
 def segmentation(self, topic):
     # use collocation first
     temp1 = topic.lower()
     for collo in self._collocation:
         if collo in temp1:
             temp1 = temp1.replace(collo, collo.replace(' ', '_'))
     return ViTokenizer.tokenize(temp1)
Code example #31
File: __init__.py  Project: IndicoDataSolutions/spaCy
 def make_doc(self, text):
     if self.Defaults.use_pyvi:
         try:
             from pyvi import ViTokenizer
         except ImportError:
             msg = ("Pyvi not installed. Either set Vietnamese.use_pyvi = False, "
                    "or install it https://pypi.python.org/pypi/pyvi")
             raise ImportError(msg)
         words, spaces = ViTokenizer.spacy_tokenize(text)
         return Doc(self.vocab, words=words, spaces=spaces)
     else:
         words = []
         spaces = []
         for token in self.tokenizer(text):
             words.extend(list(token.text))
             spaces.extend([False]*len(token.text))
             spaces[-1] = bool(token.whitespace_)
         return Doc(self.vocab, words=words, spaces=spaces)
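A hedged usage sketch: in recent spaCy releases the Vietnamese language class works this way, so a blank Vietnamese pipeline already segments with pyvi when it is installed (the sample sentence is illustrative).

import spacy

nlp = spacy.blank("vi")                     # Vietnamese defaults; pyvi-backed when use_pyvi is enabled
doc = nlp("Hà Nội là thủ đô của Việt Nam")
print([token.text for token in doc])        # tokens follow pyvi's word segmentation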