def banner():
    while True:
        print "1. Press 1 for running the tokenizer usually"
        print "2. Press 2 for creating the inverted index"
        print "3. Press 3 for creating the vectors for the documents"
        print "4. Press any other number to search"
        choice = int(raw_input("$ "))
        if not os.path.exists(TEXT):
            print "No Data at All. No Valid Corpus. Please add something to\n" + str(TEXT)
        if not os.path.exists(DATA_PATH):
            print "No Data Existed, Path Created"
            os.mkdir(DATA_PATH)  # create the path that was checked above (original called os.mkdir(DATA))
        if not os.path.exists(TOKENS) or choice == 1:
            print "Creating tokens as they don't exist/You Chose To"
            os.mkdir(TOKENS)
            tokenizer()
        if not os.path.exists(INDICES) or choice == 2:
            print "Creating indices as they don't exist"
            os.mkdir(INDICES)
            counter()
        if not os.path.exists(SCORES) or choice == 3:
            print "Creating Vectors as they don't exist"
            os.mkdir(SCORES)
            score()
            mod()
        if choice > 3 or choice == 0:
            print "Search begins"
            break
def main():
    # Take the input file path from the command line (exactly one argument expected)
    args = sys.argv[1:]
    if len(args) == 1:
        file_in = args[0]
    else:
        print("Invalid arguments")
        exit(-1)

    stopwords_fp = "../text/stopwords.txt"
    file_out = 'tokenized.txt'

    # Read text from the input file and the stopwords file into strings
    input_file = open(file_in, 'r')
    stopwords_file = open(stopwords_fp, 'r')
    text = ''
    stopwords = ''
    for line in input_file:
        text = text + line
    for line in stopwords_file:
        stopwords = stopwords + line

    # Tokenize string
    tokenized_text = tokenizer.tokenizer(text, stopwords)

    # Output tokenized text to file_out
    output_file = open(file_out, 'w')
    for line in tokenized_text:
        output_file.write(line)
def main():
    user_input = input("Please, type your Lisp-like function:")
    tokens = tokenizer(user_input)
    abs_syntax_tree = parser(tokens)
    new_ast = transformer(abs_syntax_tree)
    output = codeGenerator(new_ast)
    return output
def tokenStream(root, maxPages):
    urlDex = {}
    docTotal = 0
    for page in selfCrawler.crawl(root, maxPages):
        docTotal += 1
        raw = page.raw
        ID = page.ID
        url = page.url
        urlDex[ID] = url
        txtIDPair = {}
        print(ID)
        if raw is not None:
            try:
                txtIDPair["TEXT"] = extractor.ConcordiaPageExtract(raw)
            except Exception as e:
                print(e, len(raw))
                txtIDPair["TEXT"] = None
                print(ID, " : ", url)
        else:
            txtIDPair["TEXT"] = None
        # docLengths is expected to be a module-level dict mapping doc ID -> text length
        if txtIDPair["TEXT"] is None:
            docLengths[ID] = 0
        else:
            docLengths[ID] = len(txtIDPair['TEXT'])
        txtIDPair["ID"] = ID
        if txtIDPair["TEXT"] is not None:
            docLengths[txtIDPair["ID"]] = len(txtIDPair["TEXT"])
        for token in tokenizer.tokenizer(txtIDPair):
            token = (token[0], token[1].lower())
            yield token
    # once the crawl is exhausted, persist the ID -> URL mapping
    with open("urls.json", 'w') as url_map:
        x = json.dumps(urlDex, sort_keys=True, indent=2)
        url_map.write(x)
def __init__(self, _config, _value):
    super().__init__(_config, _value)
    self.tokens = tokenizer(_value)
    self.value = (' '.join(self.tokens)).lower()
    self.n_words = len(self.tokens)
    self.n_graphemes = len(self.value)
    self.romanized = unidecode(self.value)
def main():
    # Take the input file path from the command line (exactly one argument expected)
    args = sys.argv[1:]
    if len(args) == 1:
        file_in = args[0]
    else:
        print("Invalid arguments")
        exit(-1)

    stopwords_fp = "../text/stopwords.txt"
    file_out = 'terms.txt'

    # Read text from the input file and the stopwords file into strings
    input_file = open(file_in, 'r')
    stopwords_file = open(stopwords_fp, 'r')
    text = ''
    stopwords = ''
    for line in input_file:
        text = text + line
    for line in stopwords_file:
        stopwords = stopwords + line

    # Tokenize string
    tokenized_text = tokenizer.tokenizer(text, stopwords)

    # Generate and write list of top 200 most frequent terms
    top_200(tokenized_text, file_out)
    vocabulary_growth(tokenized_text)
def test_tokenizer(self):
    e1 = "to sentence here 12:30 3h 15min tags:tag1,tag3 , tag4"
    e2 = "plain todo"
    target = [
        ("WORD", "to"),
        ("WORD", "sentence"),
        ("WORD", "here"),
        ("INTEGER", 12),
        ("TAG_MARKER", ":"),
        ("INTEGER", 30),
        ("INTEGER", 3),
        ("TIME_UNIT", "h"),
        ("INTEGER", 15),
        ("TIME_UNIT", "min"),
        ("WORD", "tags"),
        ("TAG_MARKER", ":"),
        ("WORD", "tag1"),
        ("WORD", "tag3"),
        ("WORD", "tag4"),
    ]
    tok = tokenizer(e1)
    self.assertEqual(tok.get_token_list(), target)
    self.assertEqual(tok.tag_marker_count, 2)
def runSingleFile(self, isWrite=True):
    tokens = tokenizer(self.content)
    engine = CompilationEngine(tokens)
    pdb.set_trace()  # debugging breakpoint left in; remove when not debugging
    if isWrite:
        # asmCmds is expected to hold the commands produced by the engine
        with open(self.outputPath, 'w') as f:
            f.write(''.join(asmCmds))
def single_test(jack_path):
    with open(jack_path, 'r') as f:
        content = f.read()
    tokens = tokenizer(content)
    ce = CompilationEngine(tokens)
    exps = ce.expressions
    print(len(exps))
    print(exps[0].show())
def inverse_index(data):
    d_map = defaultdict(list)
    for idx, val in enumerate(data):
        for word in tokenizer(val):
            d_map[word].append(idx)
    return d_map
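# Hedged usage sketch (added for illustration, not part of the original source):
# assuming `tokenizer` splits text into lowercase word tokens, inverse_index maps
# each word to the list of document indices in which it occurs, e.g.
#
#   docs = ["the cat sat", "the dog barked at the cat"]
#   inverse_index(docs)
#   # -> {'the': [0, 1, 1], 'cat': [0, 1], 'sat': [0], 'dog': [1], 'barked': [1], 'at': [1]}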
def __init__(self, inputPath):
    with open(inputPath, 'r') as f:
        content = f.read()
    self.tokens = tokenizer(content)
    self.curIdx = 0
    self.XMLArr = []
    self.compileClass()
    self.XML = ''.join(self.XMLArr)
def getSentiment(self, text):
    text_tokens = tokenizer(text)
    score = 0.0
    for token in text_tokens:
        if token in self.sent_dict:
            score += self.sent_dict[token]
    if len(text_tokens) == 0:
        return 0
    return score / (len(text_tokens) * 5)
def __init__(self):
    self.sent_dict = dict()
    sent_file = open(SENTIMENT_FILE)
    for line in sent_file:
        term, score = line.split('\t')
        tokens = tokenizer(term)
        if len(tokens) > 0:
            term_token = tokens[0]
            self.sent_dict[term_token] = float(score)
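# Hedged illustration (added, not part of the original source): the two sentiment
# snippets above appear to assume SENTIMENT_FILE is a tab-separated lexicon in the
# style of AFINN, one "term<TAB>score" pair per line with scores in [-5, 5]. That
# is why getSentiment divides the summed score by len(text_tokens) * 5: it
# normalizes the per-token average into roughly [-1, 1].
#
#   example lexicon lines:
#     abandon<TAB>-2
#     awesome<TAB>4
#
#   example use (the class name is an assumption):
#     analyzer = SentimentAnalyzer()
#     analyzer.getSentiment("this is awesome")   # -> 4 / (3 * 5) ~= 0.27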
def parse_str(self, raw_string):
    """parse the given string and return a new entry instance"""
    self.tok = tokenizer(raw_string)
    # parse the string and write the results into temp vars;
    # the read methods append their results to the corresponding vars
    while True:
        t = self.tok.reveal_next_token()
        if t is None:
            break
        if t[0] == "INTEGER":
            con = self._read_time_duration()
            if con:
                continue
        if t[0] != "INTEGER" and t[1].upper() in ["TAGS", "TAG", "T"]:
            con = self._read_tags()
            if con:
                continue
            else:
                self.tm_count += 1
        # read msg
        self.msg.append(t[1])
        self.tok.consume_next_token()
    # build msg
    new_entry = entry()
    for word in self.msg:
        new_entry.msg += str(word) + " "
    new_entry.msg = new_entry.msg.strip()
    # build timedelta objects and sum them
    complete_duration = datetime.timedelta()
    for d in self.durations:
        if d[1] in HOUR_UNITS:
            cur_dur = datetime.timedelta(hours=d[0])
        elif d[1] in MIN_UNITS:
            cur_dur = datetime.timedelta(minutes=d[0])
        complete_duration += cur_dur
    new_entry.duration = complete_duration
    # build entry
    new_entry.tags = self.tags
    # reset the parser and return the new entry
    self.reset()
    return new_entry
def tokenize_file(filename: str):
    """ Tokenize words in a file """
    line = ft.read_file(filename)
    words = ts.tokenizer(line)
    df = pd.DataFrame(FreqDist(words).items(), columns=['token', 'freq'])
    df = df.dropna()
    df['doc'] = filename
    df['n_words'] = len(words)
    df['tf'] = df['freq'] / df['n_words']
    print(f"Tokenized - {filename}")
    return df
def __init__(self, name=None, eos='<eos>', sos='<sos>', unk='<unk>', pad='<pad>',
             tok_type='spacy', lower=True):
    self.tokenizer = tokenizer(tok_type)
    self.eos = eos
    self.sos = sos
    self.pad = pad
    self.unk = unk
    self.name = name
    self.lower = lower
    self.word2index = {self.sos: 0, self.eos: 1, self.pad: 2, self.unk: 3}
    self.word2count = {}
    self.index2word = {0: self.sos, 1: self.eos, 2: self.pad, 3: self.unk}
    self.n_words = 4  # counts the sos, eos, pad and unk special tokens
def parse_documents(self):
    print('Parsing the documents and creating Hashmap index ...')
    tokenizer_obj = tokenizer()
    for i in range(len(self.all_documents)):
        self.doc_ids[i + 1] = self.all_documents[i]
        tokens = tokenizer_obj.parse(self.all_documents[i])
        # Creating Index while parsing
        self.addto_hashmap_index(i + 1, tokens)
    print('Saving the term id and document id files...')
    # Files of term id and document id
    self.make_files_of_terms_and_documents()
def main():
    parser = make_parser(prog="Twitter Tokenizer")
    args = parser.parse_args()
    # TODO: add function checks and exceptions
    module = load_module(args.writer)
    writer = module.writer
    module = load_module(args.reader)
    reader = module.reader()
    for tweet in reader:
        result = tokenizer(tweet, args.tokens, args.verbose)
        writer(result)
def main(argv):
    if len(argv) >= 2:
        with open(argv[1], 'r') as f:
            grammar = read_bnf(f.read())
        print_bnf(grammar)
    if len(argv) >= 3:
        tks = list(tokenizer(argv[1]))
        for tk in tks:
            print tk
        print
        ast = parser(tks)
        print ast
def tokenizer_execute(language, page_html, link):
    obj = tokenizer.tokenizer(language)
    obj.generate_tokens()
    obj2 = tokenizer.semantic_tokenizer(obj.tokens)
    obj2.generate_tokens()
    obj3 = tokenizer.extractor(obj.tokens, obj2.semantic_tokens)
    if not page_html:
        returned_result = obj3.start_extract(link)
        return returned_result
    else:
        returned_result = obj3.start_extract_without_fetch(page_html)
        return returned_result
def handle_mentions(body, say):
    raw_message = body["event"]["text"]
    tokenized_message = tokenizer.tokenizer(raw_message)
    tokenized_message_types = [x.type for x in tokenized_message]
    # print(tokenized_message)
    if tokenized_message_types == ["USERNAME", "QR", "URL"]:
        target_url = tokenized_message[2].value
        say(blocks=[{
            "type": "image",
            "title": {
                "type": "plain_text",
                "text": f"QR Code of {target_url}"
            },
            "block_id": f"image-{target_url}",
            "image_url": QR_BASE_URL + target_url,
            "alt_text": "QR Code"
        }])
    elif tokenized_message_types == ["USERNAME", "OMIKUJI"]:
        with open("omikuji_result.json") as f:
            omikuji_result = json.load(f)["omikuji"]
        chose_result = random.choice(omikuji_result)
        say(blocks=[{
            "type": "image",
            "title": {
                "type": "plain_text",
                "text": chose_result["text"]
            },
            "block_id": "image",
            "image_url": chose_result["image"],
            "alt_text": "Image " + chose_result["text"]
        }])
    elif tokenized_message_types == ["USERNAME", "GAKUSEKI", "STUDENT_ID"]:
        students = member_list.get_members()
        student_id = tokenized_message[2].value
        the_student = students.get(student_id)
        if the_student is None:
            say("Not found.")
        else:
            s_email = the_student["email"]
            s_real_name = the_student["real_name"]
            say(blocks=[{
                "type": "section",
                "fields": [{
                    "type": "mrkdwn",
                    "text": f":e-mail:*Email:*\n{s_email}",
                }, {
                    "type": "mrkdwn",
                    "text": f":pencil:*Real Name:*\n{s_real_name}",
                }]
            }])
def process_file(fileName, mine_type):
    worker = tokenizer(fileName)
    while True:
        word = worker.nextWord()
        if word is None:
            break
        log('# ' + word)
        if mine_type == cool_mine:
            # mining mode: collect words that are not yet in the known dictionary
            if not knowndb.find(word):
                unknowndb.add(word)
        else:
            knowndb.add(word)
def calculator():
    while True:
        expr = get_input()
        tokens = tokenizer(expr)
        parentheses = []
        for tok in tokens:
            if tok == "(" or tok == ")":
                parentheses.append(tok)
        if not paren_check("".join(parentheses)):
            print("Unmatched parentheses")
            continue  # skip evaluation of an unbalanced expression
        if tokens is not None and len(tokens) > 0:
            postfix = infixtopostfix(tokens)
            evaluation = postfixevaluation(postfix)
        print("press CTRL + C to quit, else press any key to continue")
def search(search_query):
    """ Searches for normalized words in the search query in the tokens table
    and returns the doc names in descending order of sum(freq) of query words"""
    keywords = "','".join(ts.tokenizer(search_query))
    db_conn = db.create_db_conn("MSSQL")
    tblname = f"{db.DBNAME}.{db.SCHEMA}.tokens"
    sql = (
        f"SELECT doc, sum(freq) "
        f"FROM {tblname} "
        f"WHERE token IN ('{keywords}') "
        f"GROUP BY doc "
        f"ORDER BY 2 DESC"
    )
    print(sql)
    df = pd.read_sql_query(sql, db_conn)
    return df
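# Hedged illustration (added, not part of the original source): for a query such
# as "quick brown", search() above builds and runs SQL of the form
#
#   SELECT doc, sum(freq)
#   FROM <DBNAME>.<SCHEMA>.tokens
#   WHERE token IN ('quick','brown')
#   GROUP BY doc
#   ORDER BY 2 DESC
#
# i.e. documents are ranked by the summed frequency of the query tokens they
# contain. The tokens are interpolated directly into the SQL string (no parameter
# binding), so the query text is assumed to be trusted input.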
def main(argv):
    print('Sedna Parser')
    if len(argv) < 2:
        print('command line syntax: <input-source> ...')
        return
    print('---start---')
    for inp in argv[1:]:
        fd = open(inp, 'rb')
        data = fd.read().decode('utf-8')
        fd.close()
        ast = tokenizer.tokenizer(data)
        print(json.dumps(ast))
def matches(self, term):
    """Given a term, returns a list of postings for that term and its
    synonyms. We find synonyms for the given term, obtain their postings,
    and merge them with the \"original\" list of postings for the term
    itself."""
    # Obtain the postings list for this term.
    term_postings = \
        set(self.compound_index.postings_list(self.INDEX, term))
    # Convert the postings list into a dictionary,
    # where keys are doc IDs and values are term counts.
    posting_dict = {}
    for posting in term_postings:
        posting_dict[posting[0]] = posting[1]
    # Find synonyms of the term from our thesaurus.
    thesaurus = Thesaurus()
    unstemmed = self.stemmed_unstemmed_map(self.INDEX)[term]
    synonyms = thesaurus[unstemmed]
    for synonym in synonyms:
        # Get the postings for this synonym.
        stemmed_synonym = tokenizer(synonym)[0]
        postings = self.compound_index.postings_list(
            self.INDEX, stemmed_synonym)
        # Update the existing postings with each synonym's count.
        for posting in postings:
            doc_id = posting[0]
            count = posting[1]
            if doc_id in posting_dict:
                posting_dict[doc_id] += count
            else:
                posting_dict[doc_id] = count
    # Convert the posting dictionary back into a posting list.
    combined_postings = []
    for doc_id, count in posting_dict.iteritems():
        combined_postings.append([doc_id, count])
    return sorted(combined_postings)
def run(sq):
    url_list = [
        "https://isha.sadhguru.org/us/en/wisdom/article/what-to-eat-making-right-food-choices",
        "https://www.pythonforbeginners.com/basics/getting-user-input-from-the-keyboard",
        "https://medium.com/center-for-data-science/deepmind-fellow-profile-ksenia-saenko-e6d0f7574a59",
        "https://medium.com/center-for-data-science/deepmind-fellow-profile-yassine-kadiri-7bfe4a045050"
    ]
    data = inverse_index(hit_urls(url_list))
    # update main map with words from the html pages, with their occurrences
    MAIN_MAP.update(data)
    query = tokenizer(sq)
    root = Node()
    ignore = ['©', '—', '’', '“', '”', "''"]
    for word in MAIN_MAP:
        if word not in ignore:
            add(root, word)
    retval = {}
    # search the compressed trie using the find function
    for key in query:
        if find(root, key):
            retval.update({key: MAIN_MAP[key]})
    resulting_idx = ranking(retval)
    if not resulting_idx:
        print(f'\n No results for your search query - {sq}')
        print('\n Modify the query and try again, listed below are the searched URLs')
        for idx, ul in enumerate(url_list):
            print(f'{idx+1}.{ul}')
        return
    print("\n Search results, in decreasing order of relevance \n")
    for idx, val in enumerate(resulting_idx):
        print(f'{idx+1}: {url_list[val]}')
def get_tokens_for(self, index, unstemmed=False):
    """Given an index (title or abstract), returns a list of tokens from
    the words contained in that index. By default, this will return
    case-folded and stemmed tokens. If the unstemmed argument is set to
    True, the original words will be returned instead."""
    raw_text = self.__text.get(index)
    if unstemmed:
        # We need to return just the words, without any punctuation.
        words = raw_text.split()
        stripped = [x.strip(string.punctuation) for x in words]
        return stripped
    tokens = tokenizer(raw_text)
    # We want to strip out tokens which consist of just
    # punctuation characters.
    return [x for x in tokens if x not in string.punctuation]
def solve(equation):
    expression = tokenizer(equation)
    left_side = right_side = None
    comp = None
    for i, token in enumerate(expression):
        if token.type == 'Comparator':
            left_side = build(expression[:i])
            right_side = build(expression[i + 1:])
            comp = token.value
            break
    if comp is None:
        expression = build(expression)
        print(f'The answer is {solve_side(expression)}')
    else:
        if comp == '=':
            comp = '=='
        result = eval(
            f'{solve_side(left_side)} {comp} {solve_side(right_side)}')
        print(f'The equation is {result}')
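# Hedged usage sketch (added, not part of the original source), assuming the
# surrounding tokenizer/build/solve_side helpers evaluate arithmetic as their
# names suggest:
#
#   solve("2 + 3")       # no comparator  -> prints "The answer is 5"
#   solve("2 + 3 = 5")   # '=' comparator -> evaluated as '==', prints "The equation is True"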
def NBtesting(doc_string):
    current_file_path = os.path.dirname(os.path.abspath(__file__))
    # total document number
    # documentCount = len(testing_docs)
    # result is a dictionary, used to store the testing result.
    # i.e. `result` = { doc_1: class_of_doc_1, doc_2: class_of_doc_2, ...}
    result = dict()
    classes = ["1", "2", "3", "5"]
    # training result files hold (new_V, prior, condprob), where
    # condprob[term][_class] stores the term/class conditional probability
    f_v = open(current_file_path + "/training_result/v.txt", "r")
    v = f_v.read().decode("utf-8").split(",")
    f_prior = open(current_file_path + "/training_result/prior.txt", "r")
    prior = dict()
    for row in csv.DictReader(f_prior):
        prior[row["class"]] = float(row["prob"])
    f_condprob = open(current_file_path + "/training_result/condprob.txt", "r")
    condprob = dict()
    for row in csv.DictReader(f_condprob):
        term = row["term"].decode("utf-8")
        _class = row["class"]
        prob = float(row["prob"])
        if term not in condprob:
            condprob[term] = dict()
        condprob[term][_class] = prob
    doc_terms = tokenizer.tokenizer(doc_string)
    result = naive_bayes.ApplyMultinomialNB(classes, v, prior, condprob, doc_terms)
    return result
def parse_input(self):
    assign_token = []
    assign_value = []
    input_lines = input(">>: ")  # 3,4
    input_tokens = tokenizer(input_lines)
    for token in input_tokens:
        if not token[1] == "comma":
            if token[1] == 'bool':
                val = (token[0].replace("\"", ""), token[1])
                token = val
            assign_value.append(token)
    i = 0
    assign_token.append(self.current_token)
    assign_token.append(("=", "assignment"))
    assign_token.append(assign_value[i])
    assign_token.append(("INPUT", "INPUT"))
    self.keep("identifier")
    i += 1
    while self.current_token[1] == 'comma':
        self.keep("comma")
        assign_token.append(self.current_token)
        assign_token.append(("=", "assignment"))
        self.keep("identifier")
        if assign_value[i][1] != 'EOF':
            assign_token.append(assign_value[i])
            assign_token.append(("INPUT", "INPUT"))
        else:
            self.error("\nExpected more inputs")
        i += 1
    if (len(assign_value) - 1) != i:
        # input values is greater than identifiers
        self.error("\nExpected less inputs")
    for i in range(0, len(assign_token), 4):
        input_assign = Parser(assign_token[i:i + 4])
        input_assign.parse_assign()
def batch_test():
    for jack_path, target_path in [
        ['../test/ArrayTest/Main.jack',
         '../test/engine_test/array_main_actual.xml'],
        ['../test/Square/Main.jack',
         '../test/engine_test/square_main_actual.xml'],
        ['../test/Square/Square.jack',
         '../test/engine_test/square_actual.xml'],
        ['../test/Square/SquareGame.jack',
         '../test/engine_test/square_game_actual.xml'],
        ['../test/ExpressionLessSquare/Main.jack',
         '../test/engine_test/exp_main_actual.xml'],
        ['../test/ExpressionLessSquare/Square.jack',
         '../test/engine_test/exp_actual.xml'],
        ['../test/ExpressionLessSquare/SquareGame.jack',
         '../test/engine_test/exp_game_actual.xml'],
    ]:
        print(jack_path)
        print(target_path)
        with open(jack_path, 'r') as f:
            content = f.read()
        tokens = tokenizer(content)
        CompilationEngine(tokens).treeToXml(target_path)
import unicodedata
import nltk
import enchant
import os
import tokenizer
import metaphone
import plausibleWords
import dictionarySearch
import getngrams

splitSentencesArray = tokenizer.tokenizer()
incorrectlySpelled = dictionarySearch.dictionarySearch(splitSentencesArray)
correctlySpelled = []
summationArray = []
outputSentencesArray = splitSentencesArray
tempSum = 0.0

for i in xrange(len(incorrectlySpelled)):
    plausibleList = plausibleWords.plausibleWords(
        splitSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]])
    for j in xrange(len(plausibleList)):
        if incorrectlySpelled[i][1] != 0 and \
                incorrectlySpelled[i][1] != (len(splitSentencesArray[incorrectlySpelled[i][0]]) - 1):
            testString1 = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + plausibleList[j] \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
            testString = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + " " + plausibleList[j] + " " \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
        elif incorrectlySpelled[i][1] == 0:
            testString1 = plausibleList[j] \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
            testString = plausibleList[j] + " " \
                + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] + 1]
        else:
            testString1 = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + plausibleList[j]
            testString = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1] - 1] \
                + " " + plausibleList[j]
        os.system("getngrams.py " + testString + " -noprint -quit")
def preProcess(texto, linguagem):
    return stemming.stemmer(
        removalStopwords.removalStopwords(tokenizer.tokenizer(texto), linguagem),
        linguagem)
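# Hedged note (added, not part of the original source): preProcess above chains
# tokenize -> stopword removal -> stemming, passing the language through each
# stage. Unrolled with the same helper modules it would read:
#
#   def preProcess(texto, linguagem):
#       tokens = tokenizer.tokenizer(texto)
#       sem_stopwords = removalStopwords.removalStopwords(tokens, linguagem)
#       return stemming.stemmer(sem_stopwords, linguagem)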
def _is_jack_file(file):
    return file.split('.')[1] == 'jack'


def _get_jack_files(arg):
    if _is_file(arg):
        files = [arg]
    else:
        content_in_directory = \
            ['{}/'.format(arg) + content for content in os.listdir(arg)]
        files = [file for file in content_in_directory if _is_file(file)]
    return (file for file in files if _is_jack_file(file))


def _get_output_filename(arg):
    return '{}.vm'.format(arg.split('.jack')[0])


##################--------- DRIVER CODE -------###################### # NOQA

if __name__ != '__main__':
    print 'Please run as a self-contained program'

jack_files = _get_jack_files(sys.argv[1])
for file in jack_files:
    token_gen = tokenizer(file)
    output_file = open(_get_output_filename(file), 'w')
    compile_file(output_file, token_gen)
def pass0(self):
    # read input file and parse it (also invokes the preprocessor)
    self.t = tokenizer.tokenizer()
    self.t.parse(self.mainFileName)
    return "-"


def binary_op():
    # print "Parsing bin op with %s" % sym
    if sym.type == "+":
        return "+"
    elif sym.type == "-":
        return "-"
    elif sym.type == "*":
        return "*"
    elif sym.type == "/":
        return "/"


if __name__ == '__main__':
    import sys
    line = sys.stdin.read()
    t = tokenizer()(line)

    def f():
        global sym
        try:
            sym = map_sym(t.next())
            # print "Sym = %s" % sym
        except StopIteration:
            return

    getsym = f
    getsym()
    print expect(expr())
elif args.dataset == 'covid':
    folder_name = 'covid/'
    pos_weight = torch.tensor((290726 - 405) / 290726, dtype=torch.float32)
    # pos_weight = torch.tensor(1.0, dtype=torch.float32)
    valid = readData(folder_name + 'dev.csv')
    test = readData(folder_name + 'test.csv')
    train = readData(folder_name + 'train.csv')
elif args.dataset == 'for_submission':
    folder_name = 'for_submission/'
    pos_weight = torch.tensor((2335 - 120) / 120, dtype=torch.float32)
    # pos_weight = torch.tensor(1.0, dtype=torch.float32)
    valid = readData(folder_name + 'dev.csv')
    test = readData(folder_name + 'train.csv')
    train = readData(folder_name + 'train.csv')

tk = tokenizer(train + valid + test)

# def make_batch(data):
#     X = []
#     Y = []
#     lengths = []
#     weights = []
#     for d in data:
#         x, x_len = tk.tokenize(d[0])
#         y = d[1]
#         Y.append(float(y))
#         # X.append(strToLong(x, char2int, max_length))
#         X.append(x)
#         lengths.append(x_len)
#     X = np.stack(X, axis=0)
#     Y = np.array(Y)
        # Add the next word to the sequence
        sequence += "{} ".format(next_state.split()[0])
        curr_state = next_state
        lower_bound = 0.0
        l += 1
        # Ensures the sentence generated ends appropriately
        if l >= k and next_state[len(next_state) - 1] == '.':
            end = True
    return sequence


if __name__ == '__main__':
    # Command line args conditionals
    if len(sys.argv) >= 3:
        filename = sys.argv[1]
        N = int(sys.argv[2])
    elif len(sys.argv) == 2:
        filename = sys.argv[1]
        N = 1
    else:
        filename = "markov_test_1.txt"
        N = 1
    begin_state = ""
    directory = "text_files/" + filename
    # Deque object used to store n-token states
    tokens = tokenizer.tokenizer(directory)
    markovchain, begin_state = create_map(tokens, N)
    # pairs = markovchain.get_pairs()
    # for i in range(50):
    #     print("{} contains {}".format(pairs[i][0], pairs[i][1]))
    generated_string = generate_sequence(markovchain, None, 25)
    print(generated_string)
def test_reveal_offset(self):
    e1 = "to sentence here 12:30 3h 15min tags:tag1,tag3 , tag4"
    tok = tokenizer(e1)
    self.assertEqual(tok.reveal_next_token(1), ("WORD", "sentence"))
        i, j[0], k[0],
    ))
    for x in range(1, l):
        print('%20s %20s %20s' % (
            "", "", k[x],
        ))


if __name__ == "__main__":
    tokens = tokenizer.tokens
    table = {k: [0, set()] for k in tokens}
    lexer = lex.lex(module=tokenizer())
    filename = sys.argv[1]
    if os.path.exists(filename):
        file = open(filename, 'r')
        data = file.read()
        lexer.input(data)
        while True:
            tokk = lexer.token()
            if not tokk:
                break
            table[tokk.type][0] += 1
            table[tokk.type][1].add(tokk.value)
        file.close()
    else:
        print("File Does Not Exist")
    Print()
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer  # needed for self.tfidf below
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import pandas as pd
import json
import tokenizer
import dummytokenize
import pickle

t = tokenizer.tokenizer()


class create_CNN():
    def __init__(self, X, Y, config, tokenizer=None):
        self.config_dict = json.load(open(config))
        self.lb = LabelEncoder()
        self.tfidf = TfidfVectorizer(tokenizer=tokenizer, preprocessor=tokenizer,
                                     token_pattern=None)
        self.x = X
        self.y = Y

    def fit_tfidf(self):
        print(self.x)
    logging.basicConfig(format='[%(asctime)s.%(msecs)03d] %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d_%H:%M:%S', level=numeric_level)
    logging.info('Created Logger level={}'.format(loglevel))
else:
    logging.basicConfig(filename=logfile,
                        format='[%(asctime)s.%(msecs)03d] %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d_%H:%M:%S', level=numeric_level)
    logging.info('Created Logger level={} file={}'.format(loglevel, logfile))

################################################
### MAIN #######################################
################################################

if __name__ == '__main__':
    fin = None
    fout = None
    num_threads = 1
    t = tokenizer()
    usage = """usage: {} [-i FILE -o FILE -num_threads INT] [tok_options]
    -i: (stdin)
    -o: (stdout)
    -num_threads: 1 (used only when -i and -o are used)
    -h: this message
    tok_options (See https://github.com/OpenNMT/Tokenizer for more details):
""".format(sys.argv.pop(0), t.tokopts)
    for k, v in t.tokopts.items():
        usage += "    -{}: {}\n".format(k, v)
    sys.argv = t.updateOpts(sys.argv)
    while len(sys.argv):
        tok = sys.argv.pop(0)
def _is_jack_file(file):
    return file.split('.')[1] == 'jack'


def _get_jack_files(arg):
    if _is_file(arg):
        files = [arg]
    else:
        content_in_directory = \
            ['{}/'.format(arg) + content for content in os.listdir(arg)]
        files = [file for file in content_in_directory if _is_file(file)]
    return (file for file in files if _is_jack_file(file))


def _get_output_filename(arg):
    return '{}_output.xml'.format(arg.split('.jack')[0])


##################--------- DRIVER CODE -------###################### # NOQA

if __name__ != '__main__':
    print 'Please run as a self-contained program'

jack_files = _get_jack_files(sys.argv[1])
for file in jack_files:
    token_gen = tokenizer(file)
    output_file = open(_get_output_filename(file), 'w')
    compile_file(output_file, token_gen)
stateOutput = open('states' + args.output + '.csv', 'w')
inputcsv = csv.reader(csvInput)
citycsv = csv.writer(cityOutput)
statecsv = csv.writer(stateOutput)
cityDict = dict()
stateDict = dict()
cityMeanDict = dict()
stateMeanDict = dict()

for row in inputcsv:
    if args.output == 'Yelp':
        [sentiment, city, state] = row
        if state not in states:
            continue
    elif args.output == 'Zagat':
        [sentiment, state, city] = row
    city = ' '.join(tokenizer(city))
    city = city + ':' + state
    sentiment = float(sentiment)
    if city in cityDict:
        crrnt = cityDict[city]
        cityDict[city] = [crrnt[0] + sentiment, crrnt[1] + 1]
    else:
        cityDict[city] = [sentiment, 1]
    if state in stateDict:
        crrnt = stateDict[state]
        stateDict[state] = [crrnt[0] + sentiment, crrnt[1] + 1.0]
    else:
        stateDict[state] = [sentiment, 1.0]

for k in cityDict:
    v = cityDict[k]
    mean = v[0] / v[1]