Code Example #1
def find(text):
    text = preprocess(text)
    text = ViTokenizer.tokenize(text)
    stopwords = pickle.load(open('RESTful/stopwords', 'rb'))
    vocal = pickle.load(open('RESTful/vocal', 'rb'))
    model = pickle.load(open('RESTful/model', 'rb'))

    vectorizer = TfidfVectorizer(stop_words=stopwords, vocabulary=vocal)
    corpus = [
        text,
    ]

    x = vectorizer.fit_transform(corpus)
    y = model.predict(x)

    with open('data/data.txt') as file:
        lines = file.read().split('\n')
    result = {}
    index = 0
    # find 5 results: keep sampling random lines until 5 share the query's predicted class
    while index < 5:
        i = np.random.randint(0, len(lines) - 49)  # random_integers is deprecated; randint excludes the upper bound
        line = lines[i]
        origin_line = line
        line = preprocess(line)
        line = ViTokenizer.tokenize(line)

        corpus = [
            line,
        ]
        x_find = vectorizer.fit_transform(corpus)
        y_find = model.predict(x_find)

        if y_find[0] == y[0]:
            result[index] = origin_line
            index += 1

    return result
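With a fixed vocabulary, the TfidfVectorizer above can be fitted once and then reused with transform, instead of being re-fitted on every one-document corpus. A minimal sketch, assuming the same pickled stopword list and vocabulary; the tiny reference corpus and the query string are illustrative only:

import pickle
from pyvi import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

with open('RESTful/stopwords', 'rb') as f:
    stopwords = pickle.load(f)
with open('RESTful/vocal', 'rb') as f:
    vocab = pickle.load(f)

# a tiny illustrative corpus; in find() this role is played by data/data.txt
reference_corpus = [ViTokenizer.tokenize('ví dụ thứ nhất'), ViTokenizer.tokenize('ví dụ thứ hai')]

vectorizer = TfidfVectorizer(stop_words=stopwords, vocabulary=vocab)
vectorizer.fit(reference_corpus)

# later documents are vectorized with transform(), so the IDF weights stay consistent
query_vec = vectorizer.transform([ViTokenizer.tokenize('nội dung cần tìm')])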
Code Example #2
def preprocess(txt, tokenize=True):
    try:
        txt = re.sub(RE_HTML_TAG, ' ', txt)
        txt = re.sub('&.{3,4};', ' ', txt)  # drop HTML entities such as &amp; or &quot;
        if tokenize:
            txt = ViTokenizer.tokenize(txt)
        txt = txt.lower()
        txt = re.sub(RE_CLEAR, ' ', txt)
        return txt.strip()
    except Exception:
        traceback.print_exc()
        return ''
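RE_HTML_TAG and RE_CLEAR are defined elsewhere in this project and do not appear in the listing; a minimal sketch of plausible definitions (assumptions for illustration, not the project's actual patterns):

import re

# assumed for illustration only; the project's real patterns may differ
RE_HTML_TAG = re.compile(r'<[^>]+>')           # strip anything that looks like an HTML/XML tag
RE_CLEAR = re.compile(r'[^\w\s]', re.UNICODE)  # drop punctuation and symbols, keep letters/digits/whitespace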
Code Example #3
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)
        # normalize tone-mark placement (e.g. "òa" -> "oà") before tokenizing
        text = (text.replace('òa', 'oà').replace('óa', 'oá').replace('ỏa', 'oả')
                    .replace('õa', 'oã').replace('ọa', 'oạ').replace('òe', 'oè')
                    .replace('óe', 'oé').replace('ỏe', 'oẻ').replace('õe', 'oẽ')
                    .replace('ọe', 'oẹ').replace('ùy', 'uỳ').replace('úy', 'uý')
                    .replace('ủy', 'uỷ').replace('ũy', 'uỹ').replace('ụy', 'uỵ'))
        if self.tokenizer == 'underthesea':
            from underthesea import word_tokenize
            words = word_tokenize(text, format="text").split()
        else:
            from pyvi import ViTokenizer
            words = ViTokenizer.tokenize(text).split()
        text = ' '.join(words)

        return self._convert_words_to_tokens(words, text)
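An equivalent, table-driven version of the tone-mark normalization above, which is easier to extend than a chained replace:

# same substitutions as the replace chain above, expressed as a mapping
TONE_NORMALIZATION = {
    'òa': 'oà', 'óa': 'oá', 'ỏa': 'oả', 'õa': 'oã', 'ọa': 'oạ',
    'òe': 'oè', 'óe': 'oé', 'ỏe': 'oẻ', 'õe': 'oẽ', 'ọe': 'oẹ',
    'ùy': 'uỳ', 'úy': 'uý', 'ủy': 'uỷ', 'ũy': 'uỹ', 'ụy': 'uỵ',
}

def normalize_tone_marks(text: str) -> str:
    for old, new in TONE_NORMALIZATION.items():
        text = text.replace(old, new)
    return text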
Code Example #4
def makeSummary(sentences, best_sentence, query, summary_length, lambta, IDF):
    summary = [best_sentence]

    sum_len = len(
        ViTokenizer.tokenize(best_sentence.getOriginalWords()).split())

    MMRval = {}

    # keep adding sentences until the word count exceeds the summary length
    # (also stop if the candidate pool is exhausted, otherwise max() below fails on an empty dict)
    while sum_len <= summary_length and sentences:
        MMRval = {}

        for sent in sentences:
            MMRval[sent] = MMRScore(sent, query, summary, lambta, IDF)

        maxxer = max(MMRval, key=MMRval.get)
        summary.append(maxxer)
        sentences.remove(maxxer)
        sum_len += len(ViTokenizer.tokenize(maxxer.getOriginalWords()).split())

    return summary
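MMRScore is not shown in this listing; a minimal sketch of the usual maximal-marginal-relevance formula it is assumed to implement, using a hypothetical IDF-weighted cosine similarity and assuming query, like the sentences, exposes getOriginalWords():

import math
from pyvi import ViTokenizer

def _idf_cosine(a, b, IDF):
    # hypothetical helper: IDF-weighted cosine similarity between two sentence objects
    ta = ViTokenizer.tokenize(a.getOriginalWords()).lower().split()
    tb = ViTokenizer.tokenize(b.getOriginalWords()).lower().split()
    wa = {w: ta.count(w) * IDF.get(w, 1.0) for w in set(ta)}
    wb = {w: tb.count(w) * IDF.get(w, 1.0) for w in set(tb)}
    dot = sum(wa[w] * wb[w] for w in set(wa) & set(wb))
    norm = math.sqrt(sum(v * v for v in wa.values())) * math.sqrt(sum(v * v for v in wb.values()))
    return dot / norm if norm else 0.0

def MMRScore(sent, query, summary, lambta, IDF):
    # relevance to the query minus redundancy with sentences already selected
    relevance = lambta * _idf_cosine(sent, query, IDF)
    redundancy = max(_idf_cosine(sent, chosen, IDF) for chosen in summary)
    return relevance - (1 - lambta) * redundancy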
Code Example #5
def tokenize_nmt(text, num_examples=None):
    """Tokenize the English-French dataset."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            segmented = ViTokenizer.tokenize(parts[1])
            target.append(segmented.split(' '))
    return source, target
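A small usage sketch with a hypothetical tab-separated English-Vietnamese snippet; the Vietnamese segmentation shown is approximate and depends on the pyvi model:

raw = "hello\txin chào\nthank you\tcảm ơn bạn"
src, tgt = tokenize_nmt(raw)
# src -> [['hello'], ['thank', 'you']]
# tgt -> roughly [['xin_chào'], ['cảm_ơn', 'bạn']]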
Code Example #6
File: main.py Project: south1907/api-checkspelling
def process(text):
    text = tran(text)
    text = text.lower()
    text = ''.join(c for c in text if c not in punctuation)

    tach = ViTokenizer.tokenize(text)
    filtered_words = [
        word.replace("_", " ") for word in tach.split(" ")
        if word not in list_stopword
    ]

    return [text, filtered_words]
Code Example #7
def text_process(line_data):
    line_data = nlp.convert_unicode(line_data)
    # line_data = nlp.chuan_hoa_dau_cau_tieng_viet(line_data)   # buggy: produces malformed tokens such as chinh_tri_vẻ
    line_data = ViTokenizer.tokenize(line_data)
    line_data = line_data.lower()
    line_data = re.sub(r'\d', '', line_data).strip()
    line_data = re.sub(
        r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',
        ' ', line_data)
    line_data = re.sub(r'\s+', ' ', line_data).strip()
    line_data = remove_stopword(line_data)
    return line_data
Code Example #8
def get_tokenizer(sentence):
    '''
        lowercase the text, then tokenize it
    '''

    sentence = sentence.lower()
    sentence = ViTokenizer.tokenize(sentence)

    temp = sentence.strip().split()
    delete_stop_words(temp)

    return temp
Code Example #9
def add_data_file():
    file = request.files['file']
    if file:
        filename = secure_filename(file.filename)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)
    else:
        return jsonify('NOT FILE')

    with open(file_path) as json_file:
        data = json.load(json_file)
        data = list(data)
        for field in data:
            field["content"] = ViTokenizer.tokenize(field["content"])
            field["author"] = field["author"].strip().replace(' ', '_')
            field["title"] = ViTokenizer.tokenize(field["title"])
            field["description"] = ViTokenizer.tokenize(field["description"])
            field["topic"] = ViTokenizer.tokenize(field["topic"])
        solr.add(data)

    return jsonify("OK")
Code Example #10
def get_data(path):
    doc_data = get_datasets_localdata(path)
    X, y = doc_data.data, doc_data.target
    sw = stop_words(r"stopwords.txt")
    documents = []
    for x in X:
        doc = ViTokenizer.tokenize(x)
        doc = gensim.utils.simple_preprocess(doc)
        doc = " ".join(
            [word for word in doc if word.encode('utf-8') not in sw])
        documents.append(doc)
    return documents, y
Code Example #11
    def vi_term_tokenize(self, text):
        tokens = []
        text = self.__remove_html_tags(text)

        terms = ViTokenizer.tokenize(text)
        for term in terms.split(" "):
            if term.lower() not in stop_words.STOP_WORDS:
                if ("_" in term) or (term.isalpha() == True) and (
                        len(term) >= 3):
                    tokens.append(term)
        tokens = self.__standarlize_duplicate_token(tokens)
        return tokens
Code Example #12
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        m = ViTokenizer.tokenize(i).split(' ')
        for k in m:
            try:
                ints.append(dic[k])
            except KeyError:
                ints.append(2)  # assumed index of the unknown/OOV token
        X.append(ints)
    return X
Code Example #13
def word_segment(text, sw_file='./stopwords'):
    # Get stopword
    with open(sw_file, 'r') as f:
        sw = f.readlines()
    for i in range(len(sw)):
        sw[i] = sw[i].strip()

    # word segment
    text = ViTokenizer.tokenize(text)
    text = gensim.utils.simple_preprocess(text)
    text = [w for w in text if w not in sw]
    text = " ".join(text)
    return text
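Several snippets in this listing reload their stopword list on every call; a minimal sketch, assuming the same './stopwords' file used above, that loads it once at import time instead:

import gensim
from pyvi import ViTokenizer

with open('./stopwords', 'r') as f:
    SW = {line.strip() for line in f if line.strip()}

def word_segment_cached(text):
    # same pipeline as word_segment above, but with the cached stopword set
    tokens = gensim.utils.simple_preprocess(ViTokenizer.tokenize(text))
    return " ".join(w for w in tokens if w not in SW)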
Code Example #14
def clean_data(content):
    list_words = ViTokenizer.tokenize(content).split()
    # Get stopwords
    stopwords = []
    with open('stopwords.txt', 'r', encoding="utf-8") as f:
        for word in f:
            stopwords.append(word.strip())
    words = []  # words after removing stop words
    for word in list_words:
        if word not in stopwords:
            words.append(word)
    return ' '.join(words)
Code Example #15
def get_data(path):
    doc_data = get_datasets_localdata(path)
    X, y = doc_data.data, doc_data.target
    sw = stop_words(r"stopwords.txt")
    documents = []
    for x in X:
        doc = ViTokenizer.tokenize(x)
        doc = re.sub(r'^https?:\/\/.*[\r\n]*', '', doc, flags=re.MULTILINE)
        doc = re.sub(" \d+", " ", doc)
        doc = gensim.utils.simple_preprocess(doc)
        doc = " ".join([word for word in doc if word.encode('utf-8') not in sw])
        documents.append(doc)
    return documents, y
Code Example #16
def preprocess_vi(chatbot, statement):
    """
    Remove any consecutive whitespace characters from the statement text.
    """
    import re
    import pyvi.ViTokenizer as tokenizer

    tokenized_text = tokenizer.tokenize(statement.text)
    statement.add_extra_data('tokenized_text', tokenized_text)

    #statement.text = statement.text.lower()

    return statement
Code Example #17
def read_data_from_file_to_list(file_name):
    X = []
    y = []
    with open(file_name) as lines:
        for line in lines:
            try:
                json_data = json.loads(line)
                X.append(ViTokenizer.tokenize(json_data['comment']))
                star_num = int(json_data['star'])
                y.append(get_sentiment_from_star(star_num))
            except Exception:
                print(line)
    return X, y
Code Example #18
    def tokenize(self, text, index=-1):
        """

        :param text:
        :return: list
        """
        if index != -1:
            logging.debug('Tokenize count: %s', index)
        if index == 23730:
            logging.debug('F*****g text: %s', text)
        result = ViTokenizer.tokenize(text).split(' ')

        return result
Code Example #19
    def preprocess(self, txt, tokenize=True):
        txt = re.sub('&.{3,4};', ' ', txt)  # drop HTML entities such as &amp;
        txt = utils.convertwindown1525toutf8(txt)  # normalize Windows-1252 leftovers to UTF-8
        if tokenize:
            txt = ViTokenizer.tokenize(txt)
        txt = txt.lower()
        txt = self.replace_common_token(txt)
        txt = self.remove_emoji(txt)
        txt = re.sub(RE_CLEAR_1, ' ', txt)
        txt = re.sub(RE_CLEAR_2, ' ', txt)
        txt = re.sub(RE_CLEAR_3, ' ', txt)
        txt = utils.chuan_hoa_dau_cau_tieng_viet(txt)  # standardize Vietnamese tone-mark placement
        return txt.strip()
Code Example #20
def bag_of_words(s, words):
    bag = [0 for _ in range(len(words))]
    
    s_words = ViTokenizer.tokenize(s).split()
    #s_words = nltk.word_tokenize(s)
    s_words = [word.lower() for word in s_words]

    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                bag[i] = 1
            
    return numpy.array(bag)
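A small usage sketch for bag_of_words above; the vocabulary is hypothetical and the exact segmentation depends on the pyvi model:

# hypothetical vocabulary for illustration
vocab = ['xin_chào', 'cảm_ơn', 'tạm_biệt']
print(bag_of_words('Xin chào, cảm ơn bạn!', vocab))
# roughly: [1 1 0], assuming pyvi segments "xin chào" and "cảm ơn" into single tokens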
Code Example #21
File: pyvi_test.py Project: bangjdev/HEKATEAI-OA
def extract_name(text, stopwords):
    tokenized_text = ViTokenizer.tokenize(text)
    tokenized_text = clean_text(tokenized_text, stopwords)
    words, tags = ViPosTagger.postagging(tokenized_text)

    res = []

    for i in range(len(words)):
        if (tags[i] == "Np"):
            # print(words[i])
            res.append(words[i].replace("_", " "))

    return res
Code Example #22
File: views.py Project: lynx97/text-summarizer-demo
def sent_embedding_with_w2v(text, sentences):
    w2v = Word2Vec.load("/home/thangnd/git/python/Vietnamese_doc_summarization_basic/vi/vi.bin")
    vocab = w2v.wv.vocab  # gensim < 4.0 API; in gensim 4+ use w2v.wv.key_to_index
    X = []
    for sentence in sentences:
        sentence = ViTokenizer.tokenize(sentence)
        words = sentence.split(" ")
        sentence_vec = np.zeros(100)
        for word in words:
            if word in vocab:
                sentence_vec += w2v.wv[word]
        X.append(sentence_vec)
    return X
Code Example #23
def word_count(text_list):
    text = " ".join(text_list)
    text = ViTokenizer.tokenize(text)
    counts = dict()
    words = text.split()

    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1

    return counts
Code Example #24
def predict_specific_content():
    file_path = os.path.join(dir_path, "specific_test.txt")
    content = []
    with open(file_path, 'r', encoding="utf-8") as file:
        lines = file.readlines()
    line = ' '.join(lines)
    line = gensim.utils.simple_preprocess(line)
    line = ' '.join(line)
    line = ViTokenizer.tokenize(line)
    content.append(line)

    content_data_tfidf = tfidf_vector.transform(content)
    prediction = trained_model.predict(content_data_tfidf)
    return prediction[0]
Code Example #25
def get_top_n_words_tf(doc, n=None):
    with open("stopwords_vn.txt") as f:
        content = f.readlines()
    stopwords = frozenset([x.strip() for x in content])
    words = ViTokenizer.tokenize(convert_text(doc))
    vec = CountVectorizer(stop_words=stopwords).fit([words])
    bag_of_words = vec.transform([words])
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    topn = words_freq[:n]
    topn = [item[0] for item in topn]
    return topn
Code Example #26
    def getKeyWord(self):
        doc = self.text
        content = u''
        for i in self.getContent().lower().split('\n'):
            t = ViTokenizer.tokenize(i)
            content = content + t + u'\n'
        tfidfDict = tfidf(content)
        listkey_init = top(10, tfidfDict)
        # take the top 4/5 of the vocabulary by tf-idf (integer division so top() gets an int)
        tm = list(top(len(tfidfDict) * 4 // 5, tfidfDict).keys())
        global pharse
        pharse = []
        for word in listkey_init:
            generatePharse(word, content, doc, tm)
        return pharse
Code Example #27
def tachTu(file):
    with io.open(file, 'r', encoding='utf8') as f:
        data = json.load(f)

    newData = []
    for entry in data:
        newString = ViTokenizer.tokenize(entry['comment'])
        newData.append({
            'rating': entry['rating'],
            'comment': newString
        })

    with io.open(file, 'w', encoding='utf8') as f:
        json.dump(newData, f)
Code Example #28
File: controller.py Project: cuong1181998/IR-IT4853
def search_synonym(query):
    try:
        solr = connect_solr()
        list_words = ViTokenizer.tokenize(query).split()
        stopwords = utils.get_stopwords()

        words = []  # words after removing stop words
        for word in list_words:
            if word not in stopwords:
                words.append(word)
        
        
    except Exception:
        print("[ERROR] search synoym error: Something went wrong!")
Code Example #29
def get_tokenizer(link):
    '''
        read the text from the file, lowercase it, then tokenize
    '''

    with open(link, 'r', encoding='utf-8') as f:
        sentence = f.read()
        sentence = sentence.lower()
        sentence = ViTokenizer.tokenize(sentence)

    temp = sentence.strip().split()
    delete_stop_words(temp)

    return temp
Code Example #30
def preprocess_text(text):
    text = parse_html_v2(text)
    text = text.lower()
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_special_tags(text)  # remove content between {}
    text = remove_punctuation(text)  # remove all puntuations
    text = split_alphanum(text)  # add space between word and numeric
    text = strip_numeric(text)  # remove digits
    text = strip_non_alphanum(text)  # remove non-alphabetic characters
    text = strip_short(text, minsize=2)  # remove word with length < minsize
    text = remove_multiple_space(text).strip()  # remove space and strip
    text = ViTokenizer.tokenize(text)
    return text
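Several of the helper names above resemble the cleanup functions shipped in gensim.parsing.preprocessing (an assumption; the project's own helpers are not shown in this listing). A minimal sketch of the punctuation/numeric/whitespace portion of the pipeline using gensim directly:

from gensim.parsing import preprocessing as gp
from pyvi import ViTokenizer

def preprocess_text_gensim(text):
    # covers only the punctuation/numeric/whitespace steps; the HTML, link,
    # email and special-tag removal above rely on project helpers not shown here
    text = text.lower()
    text = gp.strip_punctuation(text)
    text = gp.split_alphanum(text)
    text = gp.strip_numeric(text)
    text = gp.strip_non_alphanum(text)
    text = gp.strip_short(text, minsize=2)
    text = gp.strip_multiple_whitespaces(text).strip()
    return ViTokenizer.tokenize(text)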