Code Example #1
File: tokenize.py  Project: wunajos/orange3-text
class WhitespaceTokenizer(BaseTokenizer):
    """ Split only by whitespace. """
    tokenizer = tokenize.WhitespaceTokenizer()
    name = 'Whitespace'
Code Example #2
class WhitespaceTokenizer(BaseTokenizer):
    """ 根据空白分词. This example. → (This), (example.)"""
    tokenizer = tokenize.WhitespaceTokenizer()
    name = '空白'
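Both classes wrap the same NLTK tokenizer. As a quick illustration of the behaviour the docstrings describe, a minimal standalone sketch, assuming only that NLTK is installed:

from nltk import tokenize

# Whitespace splitting keeps punctuation attached to the preceding token.
tokenizer = tokenize.WhitespaceTokenizer()
print(tokenizer.tokenize("This example."))  # ['This', 'example.']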
Code Example #3
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk import tokenize

dados = pd.read_csv('imdb-reviews-pt-br.csv')

palavras = " ".join([texto for texto in dados.text_pt])

# Tokenization
tokenEspaco = tokenize.WhitespaceTokenizer()
token = tokenEspaco.tokenize(palavras)

frequencia = nltk.FreqDist(token)

dataframe = pd.DataFrame({
    "Palavra": list(frequencia.keys()),
    "Frequência": list(frequencia.values())
})

print(dataframe)

# Plot
dataframeMaiores = dataframe.nlargest(columns="Frequência", n=10)

plt.figure(figsize=(12, 8))
ax = sns.barplot(data=dataframeMaiores,
                 x="Palavra",
                 y="Frequência")
plt.show()
Code Example #4
import argparse
import os

import numpy as np
import nnabla as nn
import nnabla.solvers as S
import nnabla.functions as F
import nnabla.logger as logger
from nnabla.ext_utils import get_extension_context
from nltk import tokenize

import src.model as model
from .dataset import Dataset
from .grammar import Grammar, Rule, NodeType
from .python.grammar import to_ast
from .annotation import to_encoder_input, Annotation
from .decoder import Decoder

tokenizer = tokenize.WhitespaceTokenizer()

parser = argparse.ArgumentParser()
parser.add_argument('--context', "-c", type=str, default="cpu")
parser.add_argument('--max-query-length', type=int, default=70)
parser.add_argument('--max-action-length', type=int, default=100)
parser.add_argument('--embedding-size', type=int, default=128)
parser.add_argument('--node-type-embedding-size', type=int, default=64)
parser.add_argument('--lstm-state-size', type=int, default=256)
parser.add_argument('--hidden-state-size', type=int, default=50)
parser.add_argument('--result',
                    type=str,
                    default=os.path.join("result", "django"))
parser.add_argument('--dropout', type=float, default=0.2)
parser.add_argument('--beam-size', type=int, default=15)
args = parser.parse_args()
Code Example #5
File: main.py  Project: HANBO-S/testGitHub
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk

# 'data' is the DataFrame loaded earlier in the original script.
real_data = data[data["target"] == "true"]
all_words = ' '.join([text for text in real_data.text])
wordcloud = WordCloud(width=800, height=500, max_font_size=110,
                      collocations=False).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

from nltk import tokenize
token_space = tokenize.WhitespaceTokenizer()
def counter(text, column_text, quantity):
    all_words = ' '.join([text for text in text[column_text]])
    token_phrase = token_space.tokenize(all_words)
    frequency = nltk.FreqDist(token_phrase)
    df_frequency = pd.DataFrame({"Word": list(frequency.keys()),
                                   "Frequency": list(frequency.values())})
    df_frequency = df_frequency.nlargest(columns = "Frequency", n = quantity)
    plt.figure(figsize=(12,8))
    ax = sns.barplot(data = df_frequency, x = "Word", y = "Frequency", color = 'blue')
    ax.set(ylabel = "Count")
    plt.xticks(rotation='vertical')
    plt.show()

counter(data[data["target"] == "fake"], "text", 20)
Code Example #6
File: tnt.py  Project: sushengyang/NLP-project
def pos_tag(sentence, model_path=None, verbose=False):
    """
    Use TnT to parse a sentence
    
    @param sentence: Input sentence to parse
    @type sentence: L{str}
    @return: C{DepGraph} the dependency graph representation of the sentence
    """

    tnt_bin = config_tnt(verbose=verbose)

    if not model_path:
        model_path = '%s/models/wsj' % tnt_bin[:-4]

    input_file = '%s/tnt_in.txt' % tnt_bin[:-4]
    output_file = '%s/tnt_out.txt' % tempfile.gettempdir()

    execute_string = '%s %s %s > %s'
    if not verbose:
        execute_string += ' 2> %s/tnt.out' % tempfile.gettempdir()

    tagged_words = []

    f = None
    try:
        if verbose:
            print('Begin input file creation')
            print('input_file=%s' % input_file)

        f = open(input_file, 'w')
        words = tokenize.WhitespaceTokenizer().tokenize(sentence)
        for word in words:
            f.write('%s\n' % word)
        f.write('\n')
        f.close()
        if verbose: print('End input file creation')

        if verbose:
            print('tnt_bin=%s' % tnt_bin)
            print('model_path=%s' % model_path)
            print('output_file=%s' % output_file)

        execute_string = execute_string % (tnt_bin, model_path, input_file,
                                           output_file)

        if verbose:
            print('execute_string=%s' % execute_string)

        if verbose: print('Begin tagging')
        tnt_exit = os.system(execute_string)
        if verbose: print('End tagging (exit code=%s)' % tnt_exit)

        f = open(output_file, 'r')
        lines = f.readlines()
        f.close()

        tagged_words = []
        tokenizer = tokenize.WhitespaceTokenizer()
        for line in lines:
            if not line.startswith('%%'):
                tokens = tokenizer.tokenize(line.strip())
                if len(tokens) == 2:
                    tagged_words.append((tokens[0], tokens[1]))

        if verbose:
            for tag in tagged_words:
                print(tag)

    finally:
        if f: f.close()

    return tagged_words
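A hypothetical invocation, assuming a working TnT installation that config_tnt can locate plus the default WSJ model; the sentence and the printed tags below are illustrative only:

# Illustrative call; requires the external TnT binary and a trained model.
tags = pos_tag('The quick brown fox jumps over the lazy dog .', verbose=False)
print(tags)  # e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ...]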
Code Example #7
            self.svd_transformer = TruncatedSVD(n_components=k)
        except Exception as ex:
            print(ex)

        return self.svd_transformer.fit(X)

    def transform(self, X, Y=None):
        return self.svd_transformer.transform(X)

    def get_params(self, deep=True):
        return {}


punctuation_token = tokenize.WordPunctTokenizer()
space_token = tokenize.WhitespaceTokenizer()
list_punctuation = [point for point in punctuation]
punctuation_stopwords = list_punctuation + stop_words
without_accents = []
without_accents_stop_words = []


@app.route('/')
def index():
    return flask.render_template('index.html')


def tokenize(df):
    processed_sentence = list()

    for sentence in df.sentence: