Code example #1
File: rte.py Project: gchrupala/reimaginet
 def tokens(s):
     if tokenizer == 'words':
         return tokenize(s)
     elif tokenizer == 'raw':
         return s
     else:
         raise ValueError("Unknown tokenizer {}".format(tokenizer))
Code example #2
 def tokens(s):
     if tokenizer == 'words':
         return tokenize(s)
     elif tokenizer == 'raw':
         return s
     else:
         raise ValueError("Unknown tokenizer {}".format(tokenizer))
Code example #3
File: parser.py Project: testrocket/expr-calc
def parse_rpn(expr):

    tokens_list = tokens.tokenize(expr)

    result, tmp = [], []
    i, n = 0, len(tokens_list)
    while i < n:
        token = tokens_list[i]
        if token.ttype in [tokens.Token.INT, tokens.Token.DOUBLE]:
            result.append(token)
        elif token.ttype == tokens.Token.ID:
            if (i + 1) < n and tokens_list[i + 1].ttype == tokens.Token.PAREN_LEFT:
                tmp.append(token)
            else:
                result.append(token) 
        elif token.ttype == tokens.Token.PAREN_LEFT:
            tmp.append(token)
        elif token.ttype in tokens.Token.OPERATORS:
            op_priority = operator_priority(token)
            while tmp and operator_priority(tmp[len(tmp) - 1]) >= op_priority:
                result.append(tmp.pop())
            tmp.append(token)
        elif token.ttype == tokens.Token.PAREN_RIGHT:
            while tmp and tmp[len(tmp) - 1].ttype != tokens.Token.PAREN_LEFT:
                result.append(tmp.pop())
            tmp.pop()

            if tmp and tmp[len(tmp) - 1].ttype == tokens.Token.ID:
                result.append(tmp.pop())

        i += 1

    while tmp:
        result.append(tmp.pop())
    return result
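parse_rpn above is the shunting-yard algorithm: operands go straight to the output, operators wait on a stack until an operator of equal or higher priority (or a closing parenthesis) flushes them, and whatever remains on the stack is drained at the end. Below is a minimal self-contained sketch of the same idea for plain arithmetic strings; the regex tokenizer and the priority table are illustrative assumptions, not the expr-calc project's API.

import re

# Hypothetical operator priorities for this sketch only.
PRIORITY = {'+': 1, '-': 1, '*': 2, '/': 2}

def to_rpn(expr):
    output, stack = [], []
    for tok in re.findall(r'\d+|[()+\-*/]', expr):
        if tok.isdigit():                      # operand: straight to output
            output.append(tok)
        elif tok == '(':
            stack.append(tok)
        elif tok == ')':                       # flush back to the matching '('
            while stack and stack[-1] != '(':
                output.append(stack.pop())
            stack.pop()                        # discard the '(' itself
        else:                                  # operator: pop >=-priority operators first
            while stack and stack[-1] != '(' and PRIORITY[stack[-1]] >= PRIORITY[tok]:
                output.append(stack.pop())
            stack.append(tok)
    while stack:                               # drain what is left
        output.append(stack.pop())
    return output

print(to_rpn('2*(3+4)-5'))  # ['2', '3', '4', '+', '*', '5', '-']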
Code example #4
File: converter.py Project: alexbagirov/py-man2html
    def emphasize(self, line) -> None:
        if not len(line) > 2:
            return
        first_modifier = line[1]
        second_modifier = None
        if str.isalpha(line[2]):
            second_modifier = line[2]
        self.last_line_length = len(line) - 2

        first_markup, second_markup = '{}', None
        if first_modifier == 'B':
            first_markup = BOLD_MARKUP
        elif first_modifier == 'I':
            first_markup = ITALICS_MARKUP
        if second_modifier == 'B':
            second_markup = BOLD_MARKUP
        elif second_modifier == 'I':
            second_markup = ITALICS_MARKUP
        elif second_modifier == 'R':
            second_markup = '{}'

        tokens = tokenize(line)
        result = ''
        for i in range(1, len(tokens)):
            if i % 2 == 1:
                result += first_markup.format(tokens[i])
            else:
                if second_markup:
                    result += second_markup.format(tokens[i])
                else:
                    result += first_markup.format(tokens[i])

        _result = self.replace_special_symbols(result)
        self.last_line_length = len(self.clean_html(_result))
        self.body += _result
Code example #5
def parse(code, cx):
    toks = [tokens.tok(t) for t in tokens.tokenize(code)] + [None]
    p = Parser(toks, cx.s)
    tree = p.expr()
    if p.tl[:1] != [None]:
        raise ParseException()
    if len(p.tl) > 1:
        cx.warn('not all tokens used')
    return tree
Code example #6
File: driver.py Project: lgelderloos/reimaginet
def tokens(sent, tokenizer='word'):
    if tokenizer == 'word':
        return tokenize(sent['raw'])
    elif tokenizer == 'word-clean':
        return sent['tokens']
    elif tokenizer == 'char':
        return list(sent['raw'])
    elif tokenizer == 'phon':
        # remove spaces/punctuation, and lowercase
        return [c.lower() for c in sent['raw'] if c in string.letters]
Code example #7
File: driver.py Project: gchrupala/reimaginet
def tokens(sent, tokenizer='word'):
    if tokenizer == 'word':
        return tokenize(sent['raw'])
    elif tokenizer == 'word-clean':
        return sent['tokens']
    elif tokenizer == 'char':
        return list(sent['raw'])
    elif tokenizer == 'phon':
        # remove spaces/punctuation, and lowercase
        return [ c.lower() for c in sent['raw'] if c in string.letters ]
Code example #8
File: scripts.py Project: reynull20/Search-Engine
def generate_positional_score(all_tokens, soup):
    '''This function goes through all the tokens for a given webpage and determines each token's positional score by examining the webpage's soup object.
        Using the POSITIONAL_SCORING_METRIC dictionary, it assigns a score to each token.
        @param all_tokens: all the tokens appearing in a given webpage
        @param soup: the BeautifulSoup object for the webpage in question
    '''
    result = defaultdict(float)
    header_tokens = tokens.tokenize(get_all_header_strings(soup))
    try:
        title_tokens = tokens.tokenize(soup.title.get_text())
    except AttributeError:
        title_tokens = dict()
    for token, freq in all_tokens.items():
        result[token] += POSITIONAL_SCORING_METRIC['body'] 
        if(token in header_tokens.keys()):
            result[token] += POSITIONAL_SCORING_METRIC['header']
        if(token in title_tokens.keys()):
            result[token] += POSITIONAL_SCORING_METRIC['title']

    return result
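The docstring above describes the scoring rule: every token earns a base score for appearing in the page body, plus bonuses when it also appears in a header or in the title. Below is a stripped-down sketch of that weighting; the weight values and the argument shapes are placeholders, not the Search-Engine project's actual POSITIONAL_SCORING_METRIC.

from collections import defaultdict

# Placeholder weights standing in for POSITIONAL_SCORING_METRIC.
WEIGHTS = {'body': 1.0, 'header': 2.0, 'title': 3.0}

def positional_scores(body_tokens, header_tokens, title_tokens):
    """body_tokens maps token -> frequency; header_tokens and title_tokens are sets."""
    scores = defaultdict(float)
    for token in body_tokens:
        scores[token] += WEIGHTS['body']          # base score for appearing at all
        if token in header_tokens:
            scores[token] += WEIGHTS['header']    # bonus for header occurrence
        if token in title_tokens:
            scores[token] += WEIGHTS['title']     # bonus for title occurrence
    return scores

print(dict(positional_scores({'cat': 2, 'dog': 1}, {'cat'}, set())))
# {'cat': 3.0, 'dog': 1.0}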
Code example #9
def file(path):
    directory = os.path.dirname(os.path.realpath(os.path.abspath(path)))
    file = open(path, "r+")
    lines = file.readlines()
    linenum = 0
    while (linenum < len(lines)):
        lines[linenum] = tokens.tokenize(lines[linenum],
                                         directory=relpath(path))
        linenum += 1
    content = '\n'.join(lines)
    exec(content)
Code example #10
File: scripts.py Project: reynull20/Search-Engine
def read_corpus_into_db(crawled_dict):
    '''Using the bookkeeping dictionary, we write all of the webpage information into the database using this function.
        @param crawled_dict: The bookkeeping dictionary read from bookkeeping.json
    '''
    for path, url in crawled_dict.items():
        print("Processing:",path, ' ', url)
        soup = create_soup(open(RAW_DATA_FOLDER + '/' + path))
        try:
            title = soup.title.get_text()
        except AttributeError:
            title = ''
        all_tokens = tokens.tokenize(soup.get_text())
        print("Tokenized: ", title)
        database.write_webpage(path,url,title,all_tokens)
Code example #11
def mgs_require(module_name):
    file = open(module_name, "r+")
    lines = file.readlines()
    linenum = 0
    lines.insert(0, "from functions import * \n")
    while (linenum < len(lines)):
        lines[linenum] = tokens.tokenize(lines[linenum],
                                         directory=relpath(module_name))
        linenum += 1
    source = '\n'.join(lines)
    module = types.ModuleType(module_name)
    exec(source, module.__dict__)
    sys.modules[module_name] = module
    return module
Code example #12
File: parser.py Project: trevorsummerssmith/vinge
def parse_log(lines):
    """
    Args:
      lines (iterable of str) Assumes the trailing newline is part of the line

    Returns:
       (list of LogLineVertex,
        dict(str -> list of LogLineVertex),
        dict(str -> list of LogLineVertex))
        List of the actual log line vertices,
        dict from tags to the vertices that have it,
        dict from ids to the vertices that have it
    """
    log_lines = []
    tag_map = {}
    id_map = {}
    for line_number, line in enumerate(lines):
        line = line.rstrip()
        ret = _parse_log_line(line)
        # TODO(trevor) for now skip non conforming lines
        if ret is None:
            continue
        (dt, thread_id, msg) = ret

        # 1 Create log line vertex for each line
        vertex = LogLineVertex(line, msg, line_number, thread_id, dt)
        log_lines.append(vertex)

        # 2 Look for tokens, then create separate tag and id maps
        # Each map is from token to list of associated vertices
        tokens = tokenize(msg)
        for (token, token_type) in tokens:
            if token_type == TokenType.TAG:
                this_map = tag_map
            elif token_type == TokenType.ID:
                this_map = id_map
            else:
                # Skip stop words and spaces
                continue
            # We use a set here so that a line which has a word more than once
            # doesn't get counted twice
            this_set = this_map.get(token, set())
            this_set.add(vertex)
            this_map[token] = this_set
    # Turn the sets back to lists
    _dict_map(tag_map, lambda s: list(s))
    _dict_map(id_map, lambda s: list(s))

    return (log_lines, tag_map, id_map)
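The loop above collects vertices into per-token sets (so a token repeated on one line is only counted once) and converts the sets to lists at the end. That pattern works for any inverted-index-style map; here is a small generic sketch that indexes plain strings instead of vinge's vertex and token types.

# Build token -> sorted list of line numbers, deduplicating per line with a set.
def index_lines(lines):
    index = {}
    for lineno, line in enumerate(lines):
        for token in set(line.split()):   # set(): a word repeated on one line counts once
            index.setdefault(token, set()).add(lineno)
    return {token: sorted(nums) for token, nums in index.items()}

print(sorted(index_lines(["a b a", "b c"]).items()))
# [('a', [0]), ('b', [0, 1]), ('c', [1])]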
Code example #13
def parse(s):
    toks = tokens.tokenize(s)
    if (len(toks) >= 4 and toks[0].isIdentifier() and toks[1].isSymbol("=")):
        varname = toks[0].value
        toks.pop(0)
        toks.pop(0)
        value = parse_expression(toks)
        if not toks[0].isStop():
            raise InputError("Expected operator or end of input", toks[0])
        variables[varname] = value
        print("%s = %g" % (varname, value))
    else:
        result = parse_expression(toks)
        if not toks[0].isStop():
            raise InputError("Expected operator or end of input", toks[0])
        print("==> %g" % result)
Code example #14
File: converter.py Project: alexbagirov/py-man2html
    def new_subheader(self, subheader_title: str) -> None:
        if self.paragraph:
            self.body += '</div>'
            self.paragraph = False
        if self.subheader:
            self.body += '</div>'
            self.subheader = False

        self.subheader = True
        _title = tokenize(subheader_title.replace('.SS', '').strip())[0]
        self.body += '<h3 id="{}_{}">{}</h3>'.format(self.h_id - 1,
                                                     self.sub_id, _title)
        self.body += '<div style="padding-left: 4em;">'

        self.contents[str(self.h_id - 1)]['sub'][str(self.sub_id)] = _title
        self.sub_id += 1
Code example #15
def encode_line(line, cmudict, syll_mgr):
    global total
    global success
    total += 1
    line = tokens.clean(line)
    words = tokens.tokenize(line)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    encs = array.array('H')
    for word in words:
        sylls = cmudict.get_syllables(word.lower())
        if sylls == None or len(sylls) == 0:
            continue
        for syll in sylls[0]:
            enc = syll_mgr.get_encoding(syll)
            if enc != syllables.unknown_encoding:
                encs.append(enc)
    return encs
Code example #16
File: plinth.py Project: jasontbradshaw/plinth
    def cmd(self, source):
        '''
        The standard interpreter loop. Reads input, parses it, evaluates it, and
        writes the result to stdout. Continues to run until the user exits by
        sending an EOF.
        '''

        # exit when an EOF is received
        if isinstance(source, EOFError):
            sys.stdout.write(os.linesep)
            return True
        elif isinstance(source, KeyboardInterrupt):
            # clear the line and reset the source when an interrupt is received
            self.prompt = self.standard_prompt
            self.source = u''
            sys.stdout.write(os.linesep)
            return

        # otherwise, parse and evaluate the source code
        try:
            self.source += source

            # evaluate every entered expression sequentially
            for result in parse(tokens.tokenize(self.source)):
                self.stdout.write(util.to_string(evaluate(result, self.env)) +
                        os.linesep)

            # reset the prompt and source
            self.prompt = self.standard_prompt
            self.source = u''

        # allow the user to finish entering a correct expression
        except errors.ParserError:
            self.prompt = self.continue_prompt
            self.source += os.linesep

        # write all other problems and clear source
        except Exception, e:
            traceback.print_exc(file=self.stdout)

            # reset the source and prompt for the next parse
            self.source = u''
            self.prompt = self.standard_prompt
Code example #17
File: converter.py Project: alexbagirov/py-man2html
    def new_header(self, header_title: str) -> None:
        if self.paragraph:
            self.body += '</div>'
            self.paragraph = False
        if self.subheader:
            self.body += '</div>'
            self.subheader = False
        if self.header:
            self.body += '</div>'
            self.header = False

        self.header = True
        _title = tokenize(header_title.replace('.SH', '').strip())[0]
        self.body += '<h2 id="{}">{}</h2>'.format(self.h_id, _title)
        self.body += '<div style="padding-left: 4em;">'

        self.contents[str(self.h_id)] = {'title': _title,
                                         'sub': {}}
        self.h_id += 1
        self.sub_id = 1
Code example #18
def encode_line(line, cmudict, syll_mgr):
    global total
    global success
    total += 1
    line = tokens.clean(line)
    words = tokens.tokenize(line)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    encs = []
    for word in words:
        sylls = cmudict.get_syllables(word.lower())
        if sylls == None or len(sylls) == 0:
            return None
        for syll in sylls[0]:
            enc = syll_mgr.get_encoding(syll)
            if enc != syllables.unknown_encoding:
                encs.append(enc)
    labels = [0] * syll_mgr.get_size()
    for enc in encs:
        labels[enc] = 1
    success += 1
    return labels
Code example #19
File: format.py Project: trevorsummerssmith/vinge
def format_log_line_vertex(vertex):
    """
    Formats the log line as a string with ids and tags highlighted.

    Args:
        vertex (vertex.LogLineVertex)

    Returns: str
    """
    msg = ""
    # Tokenize the message
    for (token, token_type) in tokenize(vertex.message):
        fmt = token
        if token_type is TokenType.TAG:
            fmt = format_tag(fmt)
        elif token_type is TokenType.ID:
            fmt = format_id(fmt)
        msg += "%s" % fmt

    # Rebuild the log message now
    ret = "%s [%s] %s" % (vertex.time, format_tag(vertex.thread_id), msg)

    return ret
Code example #20
def handle_query(query_str):
    '''This function is called when a query is presented. It tokenizes the query, and generates the results for the query
        @param query_str: The string entered by the user which represents the query.
    '''
    query_tokens = tokens.tokenize(query_str)
    all_hits = defaultdict()
    if len(query_tokens) == 0:
        raise KeyError()
    if len(query_tokens.items()) == 1:
        for query in query_tokens.keys():
            top_fifteen = dict(
                sorted(scripts.database.get_webpages(query).items(),
                       key=lambda x: x[1],
                       reverse=True)[:15])
        return create_engine_info(top_fifteen)
    for query in query_tokens.keys():
        hits = scripts.database.get_webpages(query)
        if hits == None or hits == {}:
            continue
        all_hits[query] = dict(
            sorted(hits.items(), key=lambda x: x[1], reverse=True)[:50])
    top_fifteen = get_top_fifteen(all_hits)
    return create_engine_info(top_fifteen)
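handle_query ranks results by sorting a page-to-score dictionary on its values and slicing off the best entries (top 15, or top 50 per token). Below is the core trick isolated with made-up data; the names here are illustrative only.

def top_n(scores, n):
    """Return the n highest-scoring entries of a {key: score} dict, best first."""
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:n])

hits = {'page_a': 0.4, 'page_b': 1.7, 'page_c': 0.9}
print(top_n(hits, 2))  # {'page_b': 1.7, 'page_c': 0.9}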
Code example #21
File: word_generator.py Project: sIncerass/ELSA
    def get_words(self, sentence):
        """ Tokenizes a sentence into individual words.
            Converts Unicode punctuation into ASCII if that option is set.
            Ignores sentences with Unicode if that option is set.
            Returns an empty list of words if the sentence has Unicode and
            that is not allowed.
        """

        if not isinstance(sentence, unicode):
            raise ValueError("All sentences should be Unicode-encoded!")
        sentence = sentence.strip().lower()

        if self.break_replacement:
            sentence = convert_linebreaks(sentence)

        if self.remove_variation_selectors:
            sentence = remove_variation_selectors(sentence)

        # Split into words using simple whitespace splitting and convert
        # Unicode. This is done to prevent word splitting issues with
        # twokenize and Unicode
        words = sentence.split()
        if True:
        # if not JAPAN:
            converted_words = []
            for w in words:
                accept_sentence, c_w = self.convert_unicode_word(w)
                # Unicode word detected and not allowed
                if not accept_sentence:
                    return []
                else:
                    converted_words.append(c_w)
            sentence = ' '.join(converted_words)
            words = tokenize(sentence)
        words = [process_word(w) for w in words]
        return words
Code example #22
File: plinth.py Project: jasontbradshaw/plinth
 def parse_file(self, path):
     '''Reads a file, parses it, and returns the AST.'''
     with open(os.path.abspath(path), 'r') as f:
         return parse(tokens.tokenize(util.file_char_iter(f)))
Code example #23
File: plinth.py Project: jasontbradshaw/plinth
def evaluate(sexp, env):
    '''
    Given an Atom or list, evaluates it using the given environment
    (global by default) and returns the result as represented in our language
    constructs.
    '''

    # symbol
    if isinstance(sexp, lang.Symbol):
        # look it up in the environment for its value
        return env[sexp]

    # atom (not a literal list)
    elif not lang.Cons.is_list(sexp):
        # it's a generic atom and evaluates to itself
        return sexp

    # list
    else:
        # we can't evaluate functions that have nothing in them
        if len(sexp) == 0:
            raise errors.ApplicationError('nothing to apply')

        # evaluate functions using their arguments
        function = evaluate(sexp.car, env)
        args = sexp.cdr

        # make sure our first item evaluated to a function
        if not isinstance(function, lang.Callable):
            raise errors.ApplicationError('wrong type to apply: ' +
                    str(function))

        # quote
        if function is primitives.quote:
            # return the argument unevaluated
            util.ensure_args(args, num_required=1)
            return args.car

        # quasiquote
        elif function is primitives.quasiquote:
            util.ensure_args(args, num_required=1)
            return quasiquote_evaluate(args.car, env)

        # function
        elif function is primitives.lambda_:
            util.ensure_args(args, num_required=2)

            arg_symbols = args.car
            body = args.cdr.car

            # return a function with the current environment as the parent
            return lang.Function(evaluate, env, arg_symbols, body)

        # macro
        elif function is primitives.macro:
            util.ensure_args(args, num_required=2)

            arg_symbols = args.car
            body = args.cdr.car

            # return a macro with the given symbols and body
            return lang.Macro(evaluate, env, arg_symbols, body)

        # macro expand
        elif function is primitives.expand:
            util.ensure_args(args, num_required=1, is_variadic=True)

            # evaluate to get the macro and its arguments
            m = evaluate(args.car, env)
            arg_expressions = [evaluate(arg, env) for arg in args.cdr]

            # make sure we got a macro
            util.ensure_type(lang.Macro, m)

            return m(evaluate, env, *arg_expressions)

        # define
        elif function is primitives.define:
            util.ensure_args(args, num_required=2)

            symbol = args.car
            value = args.cdr.car

            # make sure we're defining to a symbol
            util.ensure_type(lang.Symbol, symbol)

            # evaluate the argument, map the symbol to the result in the current
            # environment, then return the evaluated value. this allows for
            # chains of definitions, or simultaneous variable assignments to the
            # same value.
            result = evaluate(value, env)
            env[symbol] = result

            # set the function or macro name if possible
            if isinstance(result, lang.Callable):
                result.name(symbol.value)

            return result

        # cond
        elif function is primitives.cond:
            for tup in args:
                # if e is not a list, len() raises an error for us
                if len(tup) != 2:
                    # make sure each is a list of exactly two expressions
                    s = 'expected 2 expressions, got ' + str(len(tup))
                    raise errors.IncorrectArgumentCountError(s)

                # first and second list items are condition and result
                condition = tup.car
                result = tup.cdr.car

                # evaluate and return the result if condition is true
                if evaluate(condition, env):
                    return evaluate(result, env)

            # if no result is returned, result is undefined
            raise errors.ApplicationError('at least one condition must ' +
                    'evaluate to ' + tokens.TRUE)

        # logical and
        elif function is primitives.and_:
            util.ensure_args(args, num_required=2, is_variadic=True)

            # evaluate the arguments, returning the final one if none were #f,
            # otherwise the last evaluated item, #f.
            last_item = None
            for item in args:
                last_item = evaluate(item, env)
                if last_item is False:
                    break

            return last_item

        # logical or
        elif function is primitives.or_:
            util.ensure_args(args, num_required=2, is_variadic=True)

            # evaluate the arguments, returning the first one that's not #f,
            last_item = None
            for item in args:
                last_item = evaluate(item, env)
                if not last_item is False:
                    break

            return last_item

        # eval
        elif function is primitives.eval_:
            util.ensure_args(args, num_required=1)

            # evaluate the given s-expression and return it
            return evaluate(evaluate(args.car, env), env)

        # load
        elif function is primitives.load:
            util.ensure_args(args, num_required=1)
            util.ensure_type(basestring, args.car)

            # evaluate every expression in the file in sequence, top to bottom
            with open(os.path.abspath(args.car), 'r') as f:
                for result in parse(tokens.tokenize(util.file_char_iter(f))):
                    evaluate(result, env)

            # return that we were successful
            return True

        # evaluate macros
        elif isinstance(function, lang.Macro):
            # evaluate the expanded form of the macro in the current environment
            return evaluate(function(evaluate, env, *args), env)

        else:
            # evaluate args and call the function with them
            return function(evaluate, *[evaluate(arg, env) for arg in args])
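The evaluate function above follows the classic interpreter pattern: symbols are looked up in the environment, other atoms evaluate to themselves, and lists are dispatched on their head, with special forms (quote, cond, define, ...) handled before ordinary function application. The following toy sketch shows that dispatch using nested Python lists and plain dicts instead of plinth's Symbol/Cons types and primitives.

# Toy evaluator sketch: nested lists as s-expressions, dicts as environments.
def tiny_eval(sexp, env):
    if isinstance(sexp, str):                   # symbol: look it up
        return env[sexp]
    if not isinstance(sexp, list):              # other atoms are self-evaluating
        return sexp
    head, args = sexp[0], sexp[1:]
    if head == 'quote':                         # special form: argument stays unevaluated
        return args[0]
    if head == 'if':                            # special form: evaluate only one branch
        cond, then, alt = args
        return tiny_eval(then if tiny_eval(cond, env) else alt, env)
    fn = tiny_eval(head, env)                   # ordinary application
    return fn(*[tiny_eval(a, env) for a in args])

env = {'+': lambda a, b: a + b, 'x': 40}
print(tiny_eval(['if', True, ['+', 'x', 2], ['quote', 'no']], env))  # 42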
Code example #24
from sets import Set
import sys

import cmudict
import tokens

cmudict = cmudict.CMUDict()

wordset = Set([])
for line in sys.stdin:
    text = line.split("\t")[0]
    words = tokens.tokenize(line)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    for word in words:
        wordset.add(word)

for word in sorted(wordset):
    print(word)
Code example #25
File: driver.py Project: gchrupala/reimaginet
def batch_sents(batcher, sents):
    return batcher.batch_inp(list(batcher.mapper.transform([ tokenize(s) for s in sents ])))
Code example #26
def processed_input(str_to_show):
    content = input(str_to_show)
    content = tokens.tokenize(content)
    return content
Code example #27
File: driver.py Project: lgelderloos/reimaginet
def batch_sents(batcher, sents):
    return batcher.batch_inp(
        list(batcher.mapper.transform([tokenize(s) for s in sents])))
Code example #28
File: parser.py Project: lamielle/simphtml
	def parse(self, lines):
		"""Parse the given lines of text into a AST that represents the simple HTML
document in the text.  Raises a ParseError for parsing problems and a
TokenizeError for tokenization problems."""
		self.tokens = tokenize(lines, True)
		return Elems(self._elems())
Code example #29
import pickle
from sklearn.feature_extraction.text import CountVectorizer

params = {"q": sys.argv[1],
		  "lang": "en",
		  "rpp": 100,
		  "result_type": "recent"}

api_base_url = "http://search.twitter.com/search.json"

raw_data = json.load(urllib2.urlopen(api_base_url + "?" + urllib.urlencode(params)))

# Load Tweets
tweets = []
for i in raw_data["results"]:
	tweets.append(tokens.tokenize(i["text"]))

# Filter out Non-English Tweets
tweets = lang_detector.filter_tweets(tweets)

# Load Classifier
f = open(sys.argv[2], 'rb')
classifier = pickle.load(f)
f.close()

vectorizer = CountVectorizer()
X = vectorizer.fit_transform([" ".join(t) for t in tweets])
y = classifier.predict_proba(X)

print "Positive, Negative :", 100 * sum(y) / sum(sum(y))
Code example #30
from rnn import RNN
from tokens import tokenize

f = open('input.txt', 'r')
sentences = list(filter(None, f.read().split('\n\n')))
f.close()
# print((sentences))
tokens = []
X = []
Y = []
# word_dim = 58
for i in range(len(sentences)):
    tokens.append([])
    for j in range(len(sentences[i])):
        if (tokenize(sentences[i][j]) != "UNKNOWN"):
            tokens[i].append(tokenize(sentences[i][j]))
    X.append(tokens[i][:-1])
    Y.append(tokens[i][1:])

model = RNN(61, 200, 10)
model.load('uwv.pkl')
model.train(X, Y, 0.1, 10, 1)
Code example #31
#import lang_detector_em as lang_detector
import lang_detector
import pickle
from sklearn.feature_extraction.text import CountVectorizer

params = {"q": sys.argv[1], "lang": "en", "rpp": 100, "result_type": "recent"}

api_base_url = "http://search.twitter.com/search.json"

raw_data = json.load(
    urllib2.urlopen(api_base_url + "?" + urllib.urlencode(params)))

# Load Tweets
tweets = []
for i in raw_data["results"]:
    tweets.append(tokens.tokenize(i["text"]))

# Filter out Non-English Tweets
tweets = lang_detector.filter_tweets(tweets)

# Load Classifier
f = open(sys.argv[2], 'rb')
classifier = pickle.load(f)
f.close()

vectorizer = CountVectorizer()
X = vectorizer.fit_transform([" ".join(t) for t in tweets])
y = classifier.predict_proba(X)

print "Positive, Negative :", 100 * sum(y) / sum(sum(y))
Code example #32
from rnn import RNN
from tokens import tokenize, detokenize

model = RNN(61,200)
model.load('uwv.pkl')

# print(model.U.shape)
# generate(start_token, num_of_chars, top_k random predictions)
text = model.generate(tokenize('F'), 300, 3)

for i in text:
    print(detokenize(i), end="")
Code example #33
File: build_lm.py Project: LanceNorskog/deep_meter
import tokens

# build languagemodel of word -> [enc, enc, ...]

sm = languagemodel.SyllableModel()
cmudict = cmudict.CMUDict()
syll_mgr = syllables.syllables()

print('Starting')
count = 0
for line in sys.stdin:
    parts = line.split('\t')
    text = parts[0]
    syllables = literal_eval(parts[1][:-1])  # newline
    text = tokens.clean(text)
    words = tokens.tokenize(text)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    clean = []
    for word in words:
        if word != ',':
            clean.append(word)
    count += 1
    if len(clean) != len(syllables):
        print("Line #: " + str(count))
        print(clean)
        print(syllables)
        continue
    for i in range(len(clean)):
        encs = []
        for s in syllables[i]:
Code example #34
""")


def help():
    print("""
Following are all the commands that are available, except the ones built into functions.py

libraries() -This command shows you all the libraries that are used in functions.py and are loaded at the start of this program's execution

file() -This command can execute your file and takes a single argument, that being the path to the file

terminal() -This command executes commands directly on your terminal so that you wouldn't have to exit and re-enter the program repeatedly.

exit() or quit() -Used simply to exit or quit the program. """)


def terminal(command):
    os.system(command)


while True:
    given = input('>>>')
    given = tokens.tokenize(given)
    try:
        exec(given)
    except Exception as error:
        print(''.join(
            format_exception(etype=type(error),
                             value=error,
                             tb=error.__traceback__)))
Code example #35
def parse(s):
    toks = tokens.tokenize(s)
    result = parse_expression(toks)
    if not toks[0].isStop():
        raise InputError("Expected operator or end of input", toks[0])
    return result
Code example #36
File: primitives.py Project: jasontbradshaw/plinth
def parse_(s):
    '''Parse a string into a list of the S-expressions it describes.'''
    util.ensure_type(basestring, s)
    return lang.Cons.build(*parse(tokens.tokenize(s)))