def banner():
    while True:
        print "1. Press 1 for running the tokenizer usually"
        print "2. Press 2 for creating the inverted index"
        print "3. Press 3 for creating the vectors for the documents"
        print "4. Press any other number to search"
        choice =  int(raw_input("$ "))
        if not os.path.exists(TEXT):
            print "No Data at All. No Valid Corpus. Please add something to\n" + str(TEXT)
        if not os.path.exists(DATA_PATH):
            print "No Data Existed, Path Created"
            os.mkdir(DATA_PATH)
        if not os.path.exists(TOKENS) or choice == 1:
            print "Creating tokens as they don't exist/You Chose To"
            os.mkdir(TOKENS)
            tokenizer()
        if not os.path.exists(INDICES) or choice == 2:
            print "Creating indices as they don't exist"
            os.mkdir(INDICES)
            counter()
        if not os.path.exists(SCORES) or choice == 3:
            print "Creating Vectors as they don't exist"
            os.mkdir(SCORES)
            score()
            mod()
        if choice > 3 or choice == 0:
            print "Search begins"
            break
Example #2
def main():

    # Take command line arguments and check for a single input file argument
    args = sys.argv[1:]

    if (len(args) == 1):
        file_in = args[0]
    else:
        print("Invalid arguments")
        exit(-1)

    stopwords_fp = "../text/stopwords.txt"
    file_in = args[0]
    file_out = 'tokenized.txt'

    # Read text from input file to string
    input_file = open(file_in, 'r')
    stopwords_file = open(stopwords_fp, 'r')
    text = ''
    stopwords = ''

    for line in input_file:
        text = text + line

    for line in stopwords_file:
        stopwords = stopwords + line

    # Tokenize string
    tokenized_text = tokenizer.tokenizer(text, stopwords)

    # Output tokenized text to file_out
    with open(file_out, 'w') as output_file:
        for line in tokenized_text:
            output_file.write(line)
Example #3
def main():
    user_input = input("Please, type your Lisp-like function:")
    tokens = tokenizer(user_input)
    abs_syntax_tree = parser(tokens)
    new_ast = transformer(abs_syntax_tree)
    output = codeGenerator(new_ast)
    return output
Example #4
def tokenStream(root, maxPages):
    urlDex = {}
    docTotal = 0
    for page in selfCrawler.crawl(root, maxPages):
        docTotal += 1
        raw = page.raw
        ID = page.ID
        url = page.url
        urlDex[ID] = url
        txtIDPair = {}
        print(ID)
        if raw is not None:
            try:
                txtIDPair["TEXT"] = extractor.ConcordiaPageExtract(raw)
            except Exception as exc:
                print(exc, len(raw))
                txtIDPair["TEXT"] = None
                print(ID, " : ", url)
        else:
            txtIDPair["TEXT"] = None
        # docLengths is assumed to be defined at module level.
        if txtIDPair["TEXT"] is None:
            docLengths[ID] = 0
        else:
            docLengths[ID] = len(txtIDPair["TEXT"])
        txtIDPair["ID"] = ID
        for token in tokenizer.tokenizer(txtIDPair):
            token = (token[0], token[1].lower())

            yield token
    with open("urls.json", 'w') as map:
        x = json.dumps(urlDex, sort_keys=True, indent=2)
        map.write(x)
Example #5
 def __init__(self, _config, _value):
     super().__init__(_config, _value)
     self.tokens = tokenizer(_value)
     self.value = (' '.join(self.tokens)).lower()
     self.n_words = len(self.tokens)
     self.n_graphemes = len(self.value)
     self.romanized = unidecode(self.value)
Example #6
def main():

    # Take command line arguments and check for a single input file argument
    args = sys.argv[1:]

    if (len(args) == 1):
        file_in = args[0]
    else:
        print("Invalid arguments")
        exit(-1)

    stopwords_fp = "../text/stopwords.txt"
    file_in = args[0]
    file_out = 'terms.txt'

    # Read text from input file to string
    input_file = open(file_in, 'r')
    stopwords_file = open(stopwords_fp, 'r')
    text = ''
    stopwords = ''

    for line in input_file:
        text = text + line

    for line in stopwords_file:
        stopwords = stopwords + line

    # Tokenize string
    tokenized_text = tokenizer.tokenizer(text, stopwords)

    # Generate and write list of top 200 most frequent terms
    top_200(tokenized_text, file_out)
    vocabulary_growth(tokenized_text)
Example #7
    def test_tokenizer(self):
        e1 = "to sentence here 12:30 3h 15min tags:tag1,tag3 , tag4"
        e2 = "plain todo"

        target = [
            ("WORD", "to"),
            ("WORD", "sentence"),
            ("WORD", "here"),
            ("INTEGER", 12),
            ("TAG_MARKER", ":"),
            ("INTEGER", 30),
            ("INTEGER", 3),
            ("TIME_UNIT", "h"),
            ("INTEGER", 15),
            ("TIME_UNIT", "min"),
            ("WORD", "tags"),
            ("TAG_MARKER", ":"),
            ("WORD", "tag1"),
            ("WORD", "tag3"),
            ("WORD", "tag4"),
        ]

        tok = tokenizer(e1)

        self.assertEqual(tok.get_token_list(), target)
        self.assertEqual(tok.tag_marker_count, 2)
Example #8
 def runSingleFile(self, isWrite=True):
     tokens = tokenizer(self.content)
     engine = CompilationEngine(tokens)
     pdb.set_trace()
     if isWrite:
         with open(self.outputPath, 'w') as f:
             f.write(''.join(asmCmds))
Example #9
def single_test(jack_path):
    with open(jack_path, 'r') as f:
        content = f.read()
    tokens = tokenizer(content)
    ce = CompilationEngine(tokens)
    exps = ce.expressions
    print(len(exps))
    print(exps[0].show())
Example #10
def inverse_index(data):
    d_map = defaultdict(list)

    for idx, val in enumerate(data):
        for word in tokenizer(val):
            d_map[word].append(idx)

    return d_map
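A minimal usage sketch of the inverted-index idea above (editor's illustration, not from the original source); the tokenizer here is a hypothetical stand-in that lowercases and splits on whitespace.

from collections import defaultdict

def whitespace_tokenizer(text):
    # Hypothetical stand-in; the example's real tokenizer is not shown.
    return text.lower().split()

docs = ["the cat sat", "the dog barked", "a cat barked"]
index = defaultdict(list)
for idx, doc in enumerate(docs):
    for word in whitespace_tokenizer(doc):
        index[word].append(idx)

print(dict(index))
# {'the': [0, 1], 'cat': [0, 2], 'sat': [0], 'dog': [1], 'barked': [1, 2], 'a': [2]}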
Example #11
 def __init__(self, inputPath):
     with open(inputPath,'r') as f:
         content = f.read()
     self.tokens = tokenizer(content)
     self.curIdx = 0
     self.XMLArr = []
     self.compileClass()
     self.XML = ''.join(self.XMLArr)
Example #12
 def getSentiment(self,text):
     text_tokens = tokenizer(text)
     score = 0.0
     for token in text_tokens:
         if token in self.sent_dict:
             score += self.sent_dict[token]
     if len(text_tokens) == 0:
         return 0
     return score/(len(text_tokens)*5)
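The division by len(text_tokens) * 5 suggests the lexicon stores per-token scores in the -5..5 range (AFINN-style), so the returned value lands roughly in [-1, 1]. A small self-contained sketch under that assumption:

def whitespace_tokenizer(text):
    # Hypothetical stand-in for the tokenizer used above.
    return text.lower().split()

sent_dict = {"good": 3.0, "terrible": -4.0}  # made-up AFINN-style scores

def get_sentiment(text):
    tokens = whitespace_tokenizer(text)
    if not tokens:
        return 0.0
    score = sum(sent_dict.get(tok, 0.0) for tok in tokens)
    return score / (len(tokens) * 5)  # normalize to roughly [-1, 1]

print(get_sentiment("good but terrible weather"))  # (3 - 4) / (4 * 5) = -0.05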
Example #13
 def __init__(self):
     self.sent_dict=dict()
     sent_file = open(SENTIMENT_FILE)
     for line in sent_file:
         term,score = line.split('\t')
         tokens = tokenizer(term)
         if len(tokens) > 0:
             term_token = tokens[0]
             self.sent_dict[term_token] = float(score)
Example #14
    def parse_str(self, raw_string):
        """parse the given string and return a new entry instance"""
        self.tok = tokenizer(raw_string)

        # parse string and write the results in temp vars
        # read methods append the results to the corresponding vars
        while True:
            
            t = self.tok.reveal_next_token()
            if t is None:
                break
            
            if t[0] == "INTEGER":
                con = self._read_time_duration()
                if con:
                    continue

            if t[0] != "INTEGER" and t[1].upper() in ["TAGS", "TAG", "T"]:
                con = self._read_tags()
                if con:
                    continue
                else:
                    self.tm_count += 1

            # read msg
            self.msg.append(t[1])
            self.tok.consume_next_token()

            
        # build msg
        new_entry = entry()

        for word in self.msg:
            new_entry.msg += str(word) + " "

        new_entry.msg = new_entry.msg.strip()
            
        # build timedelta objects and sum them
        complete_duration = datetime.timedelta()

        for d in self.durations:
            if d[1] in HOUR_UNITS:
                cur_dur = datetime.timedelta(hours = d[0])
            elif d[1] in MIN_UNITS:
                cur_dur = datetime.timedelta(minutes = d[0])

            complete_duration += cur_dur

        new_entry.duration = complete_duration

        # build entry
        new_entry.tags = self.tags

        # reset the parser and return the new entry
        self.reset()

        return new_entry
Example #15
def tokenize_file(filename: str):
    """ Tokenize words in a file """
    line = ft.read_file(filename)
    words = ts.tokenizer(line)
    df = pd.DataFrame(FreqDist(words).items(), columns=['token', 'freq'])
    df = df.dropna()
    df['doc'] = filename
    df['n_words'] = len(words)
    df['tf'] = df['freq'] / df['n_words']
    print(f"Tokenized - {filename}")
    return df
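A hedged follow-on sketch (editor's addition): one way the per-file frames returned by tokenize_file might be combined into a single corpus table with a document-frequency column, assuming `filenames` is a list of paths readable by ft.read_file.

import pandas as pd

def build_corpus_table(filenames):
    # Combine the per-file term-frequency frames produced by tokenize_file.
    frames = [tokenize_file(name) for name in filenames]
    corpus = pd.concat(frames, ignore_index=True)
    # Document frequency: number of distinct docs each token appears in.
    doc_freq = (corpus.groupby('token')['doc'].nunique()
                .rename('df').reset_index())
    return corpus.merge(doc_freq, on='token')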
Example #16
 def __init__(self, name=None, eos='<eos>', sos='<sos>', unk='<unk>', pad='<pad>', tok_type='spacy', lower=True):
     self.tokenizer = tokenizer(tok_type)
     self.eos = eos
     self.sos = sos
     self.pad = pad
     self.unk = unk
     self.name = name
     self.lower = lower
     self.word2index = {self.sos: 0, self.eos: 1, self.pad: 2, self.unk: 3}
     self.word2count = {}
     self.index2word = {0: self.sos, 1: self.eos, 2: self.pad, 3: self.unk}
     self.n_words = 4  # count of the special tokens: sos, eos, pad, unk
Example #17
    def parse_documents(self):
        print('Parsing the documents and creating Hashmap index ...')
        tokenizer_obj = tokenizer()
        for i in range(len(self.all_documents)):
            self.doc_ids[i + 1] = self.all_documents[i]
            tokens = tokenizer_obj.parse(self.all_documents[i])
            # Creating Index while parsing
            self.addto_hashmap_index(i + 1, tokens)

        print('Saving the term id and document id files...')
        # Files of term id and document id
        self.make_files_of_terms_and_documents()
Example #18
def main():
    parser = make_parser(prog="Twitter Tokenizer")
    args = parser.parse_args()
    # TODO: add function checks and exceptions
    module = load_module(args.writer)
    writer = module.writer
    module = load_module(args.reader)
    reader = module.reader()

    for tweet in reader:
        result = tokenizer(tweet, args.tokens, args.verbose)
        writer(result)
Example #19
def main(argv):
    if len(argv) >= 2:
        with open(argv[1], 'r') as f:
            grammar = read_bnf(f.read())
        print_bnf(grammar)
        if len(argv) >= 3:
            tks = list(tokenizer(argv[1]))
            for tk in tks:
                print tk
            print
            ast = parser(tks)
            print ast
Example #20
def tokenizer_execute(language, page_html, link):
    obj = tokenizer.tokenizer(language)
    obj.generate_tokens()
    obj2 = tokenizer.semantic_tokenizer(obj.tokens)
    obj2.generate_tokens()
    obj3 = tokenizer.extractor(obj.tokens, obj2.semantic_tokens)
    if not page_html:
        returned_result = obj3.start_extract(link)
        return returned_result
    else:
        returned_result = obj3.start_extract_without_fetch(page_html)
        return returned_result
Example #21
def handle_mentions(body, say):
    raw_message = body["event"]["text"]
    tokenized_message = tokenizer.tokenizer(raw_message)
    tokenized_message_types = [x.type for x in tokenized_message]
    # print(tokenized_message)

    if tokenized_message_types == ["USERNAME", "QR", "URL"]:
        target_url = tokenized_message[2].value
        say(blocks=[{
            "type": "image",
            "title": {
                "type": "plain_text",
                "text": f"QR Code of {target_url}"
            },
            "block_id": f"image-{target_url}",
            "image_url": QR_BASE_URL + target_url,
            "alt_text": "QR Code"
        }])
    elif tokenized_message_types == ["USERNAME", "OMIKUJI"]:
        with open("omikuji_result.json") as f:
            omikuji_result = json.load(f)["omikuji"]
        chose_result = random.choice(omikuji_result)
        say(blocks=[{
            "type": "image",
            "title": {
                "type": "plain_text",
                "text": chose_result["text"]
            },
            "block_id": "image",
            "image_url": chose_result["image"],
            "alt_text": "Image " + chose_result["text"]
        }])
    elif tokenized_message_types == ["USERNAME", "GAKUSEKI", "STUDENT_ID"]:
        students = member_list.get_members()
        student_id = tokenized_message[2].value
        the_student = students.get(student_id)
        if the_student is None:
            say("Not found.")
        else:
            s_email = the_student["email"]
            s_real_name = the_student["real_name"]
            say(blocks=[{
                "type":
                "section",
                "fields": [{
                    "type": "mrkdwn",
                    "text": f":e-mail:*Email:*\n{s_email}",
                }, {
                    "type": "mrkdwn",
                    "text": f":pencil:*Real Name:*\n{s_real_name}",
                }]
            }])
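The (body, say) signature matches Slack's Bolt for Python handlers. The registration code is not shown above; a sketch of how such a handler is typically wired up, assuming the app uses slack_bolt:

import os
from slack_bolt import App

# Assumes the standard Bolt environment variables are set.
app = App(token=os.environ["SLACK_BOT_TOKEN"],
          signing_secret=os.environ["SLACK_SIGNING_SECRET"])

@app.event("app_mention")
def handle_mentions(body, say):
    # The real handler above tokenizes the mention and answers accordingly.
    say(f"Received: {body['event']['text']}")

if __name__ == "__main__":
    app.start(port=3000)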
Example #22
def process_file(fileName, mine_type):
    worker = tokenizer(fileName)
    
    while True:
        word = worker.nextWord()
        if word is None:
            break
        log('# ' + word)
        if mine_type == cool_mine:
            if knowndb.find(word) == False:
                unknowndb.add(word)
        else:
            knowndb.add(word)
Example #23
def calculator():
    while True:
        expr = get_input()
        tokens = tokenizer(expr)
        parentheses = []
        for tok in tokens:
            if tok == "(" or tok == ")":
                parentheses.append(tok)
        if not paren_check("".join(parentheses)):
            print("Unmatched parentheses")
            continue
        if tokens:  # guard against None or an empty token list
            postfix = infixtopostfix(tokens)
            evaluation = postfixevaluation(postfix)
        print("press CTRL + C to quit, else press any key to continue")
Example #24
def search(search_query):
    """ Searches for normalized words in the search query in tokens table and 
    returns the doc name in the descending order of sum(freq) of query words"""
    keywords = "','".join(ts.tokenizer(search_query))
    db_conn = db.create_db_conn("MSSQL")
    tblname = f"{db.DBNAME}.{db.SCHEMA}.tokens"
    sql = f""\
    f"SELECT doc, sum(freq) "\
    f"FROM {tblname} "\
    f"where token in ('{keywords}') "\
    f"group by doc "\
    f"order by 2 desc"
    print(sql)
    df = pd.read_sql_query(sql, db_conn)
    return (df)
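Because the query above interpolates the tokens directly into the SQL string, a hedged alternative is to bind them as parameters instead. This sketch assumes the connection returned by db.create_db_conn follows the DB-API with '?' placeholders (as pyodbc does for MSSQL) and reuses the ts and db modules from the snippet above; it is not the original code.

import pandas as pd

def search_parameterized(search_query):
    keywords = ts.tokenizer(search_query)
    if not keywords:
        return pd.DataFrame(columns=["doc", "total_freq"])
    db_conn = db.create_db_conn("MSSQL")
    tblname = f"{db.DBNAME}.{db.SCHEMA}.tokens"
    placeholders = ", ".join("?" for _ in keywords)  # one '?' per keyword
    sql = (f"SELECT doc, sum(freq) AS total_freq FROM {tblname} "
           f"WHERE token IN ({placeholders}) "
           f"GROUP BY doc ORDER BY total_freq DESC")
    return pd.read_sql_query(sql, db_conn, params=keywords)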
Example #25
def main(argv):
	print('Sedna Parser')
	if len(argv) < 2:
		print('command line syntax: <input-source> ...')
		return

	print('---start---')

	for inp in argv[1:]:
		fd = open(inp, 'rb')
		data = fd.read().decode('utf-8')
		fd.close()

		ast = tokenizer.tokenizer(data)
		print(json.dumps(ast))
Example #26
    def matches(self, term):
        """Given a term, returns a list of postings for that term and its
        synonyms.

        We find synonyms for the given term, obtain their postings, and merge
        them with the \"original\" list of postings for the term itself."""
        # Obtain the postings list for this term.
        term_postings = \
            set(self.compound_index.postings_list(self.INDEX, term))

        # Convert the postings list into a dictionary,
        # where keys are doc IDs and values are term counts.
        posting_dict = {}
        for posting in term_postings:
            posting_dict[posting[0]] = posting[1]

        # Find synonyms of the term from our thesaurus.
        thesaurus = Thesaurus()
        unstemmed = self.stemmed_unstemmed_map(self.INDEX)[term]
        synonyms = thesaurus[unstemmed]

        for synonym in synonyms:
            # Get the postings for this synonym.
            stemmed_synonym = tokenizer(synonym)[0]
            postings = self.compound_index.postings_list(
                self.INDEX, stemmed_synonym)

            # Update the existing postings with each synonym's count.
            for posting in postings:
                doc_id = posting[0]
                count = posting[1]

                if doc_id in posting_dict:
                    posting_dict[doc_id] += count
                else:
                    posting_dict[doc_id] = count

        # Convert the posting dictionary back into a posting list.
        combined_postings = []
        for doc_id, count in posting_dict.items():
            combined_postings.append([doc_id, count])

        return sorted(combined_postings)
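The doc-count merge in the loop above can also be expressed with collections.Counter; a small isolated sketch with made-up doc IDs and counts:

from collections import Counter

term_postings = [(1, 3), (4, 1), (7, 2)]          # (doc_id, count) pairs
synonym_postings = [[(1, 1), (9, 5)], [(4, 2)]]   # one postings list per synonym

merged = Counter(dict(term_postings))
for postings in synonym_postings:
    for doc_id, count in postings:
        merged[doc_id] += count

combined = sorted([doc_id, count] for doc_id, count in merged.items())
print(combined)   # [[1, 4], [4, 3], [7, 2], [9, 5]]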
Example #27
def run(sq):
    url_list =[
        "https://isha.sadhguru.org/us/en/wisdom/article/what-to-eat-making-right-food-choices",
        "https://www.pythonforbeginners.com/basics/getting-user-input-from-the-keyboard",
        "https://medium.com/center-for-data-science/deepmind-fellow-profile-ksenia-saenko-e6d0f7574a59",
        "https://medium.com/center-for-data-science/deepmind-fellow-profile-yassine-kadiri-7bfe4a045050"
        ]
    data = inverse_index(hit_urls(url_list))

    # update main map with words from the html pages, with their occurrences
    MAIN_MAP.update(data)

    query = tokenizer(sq)

    root = Node()
    ignore = ['©', '—', '’', '“', '”', "''"]

    for word in MAIN_MAP:
        if word not in ignore:
            add(root, word)

    retval = {}

    # search the compressed trie using the find function
    for key in query:
        if find(root, key):
            retval.update({key: MAIN_MAP[key]})

    resulting_idx = ranking(retval)

    if not resulting_idx:
        print(f'\n No results for your search query - {sq}')
        print('\n  Modify the query and try again, listed below are the searched URLs')

        for idx, ul in enumerate(url_list):
            print(f'{idx+1}.{ul}')

        return

    print("\n Search results, in decreasing order of relevance \n")
    for idx, val in enumerate(resulting_idx):
        print(f'{idx+1}: {url_list[val]}')
Example #28
    def get_tokens_for(self, index, unstemmed=False):
        """Given an index (title or abstract), returns a list of tokens
        from the words contained in that index.

        By default, this will return case-folded and stemmed tokens. If
        the unstemmed argument is set to True, the original words will be
        returned instead."""
        raw_text = self.__text.get(index)

        if unstemmed:
            # We need to return just the words, without any punctuation.
            words = raw_text.split()
            stripped = [x.strip(string.punctuation) for x in words]
            return stripped

        tokens = tokenizer(raw_text)

        # We want to strip out tokens which consist of just
        # punctuation characters.
        return [x for x in tokens if x not in string.punctuation]
Example #29
def solve(equation):

    expression = tokenizer(equation)
    left_side = right_side = None
    comp = None
    for i, token in enumerate(expression):
        if token.type == 'Comparator':
            left_side = build(expression[:i])
            right_side = build(expression[i + 1:])
            comp = token.value
            break
    if comp is None:
        expression = build(expression)
        print(f'The answer is {solve_side(expression)}')
    else:
        if comp == '=':
            comp = '=='
        result = eval(
            f'{solve_side(left_side)} {comp} {solve_side(right_side)}')
        print(f'The equation is {result}')
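The comparison step above builds a source string for eval(); a hedged alternative sketch maps the comparator token to a function from the operator module instead (assuming solve_side returns a number and comp is one of the comparators below):

import operator

COMPARATORS = {
    '=': operator.eq, '==': operator.eq, '!=': operator.ne,
    '<': operator.lt, '<=': operator.le,
    '>': operator.gt, '>=': operator.ge,
}

def compare_sides(left_value, right_value, comp):
    return COMPARATORS[comp](left_value, right_value)

print(compare_sides(6, 6, '='))   # True
print(compare_sides(2, 5, '>'))   # False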
Example #30
def NBtesting(doc_string):

	current_file_path = os.path.dirname(os.path.abspath(__file__))

	# total document number
	# documentCount = len(testing_docs)

	# result is a dictionary, used to store the testing result.
	# i.e `result` = { doc_1: class_of_doc_1, doc_2: class_of_doc_2,...}
	result = dict()

	classes = ["1", "2", "3", "5"]

	# (new_V, prior, condprob)
	#	condprob[t][_class] term,class,prob

	f_v = open(current_file_path+"/training_result/v.txt", "r")
	v = f_v.read().decode("utf-8").split(",")
	
	f_prior = open(current_file_path+"/training_result/prior.txt", "r")
	prior = dict()
	for row in csv.DictReader(f_prior):
		prior[row["class"]]=float(row["prob"])


	f_condprob = open(current_file_path+"/training_result/condprob.txt", "r")
	condprob = dict()
	for row in csv.DictReader(f_condprob):
		term = row["term"].decode("utf-8")
		_class = row["class"]
		prob = float(row["prob"])
		
		if term not in condprob:
			condprob[term]=dict()
		condprob[term][_class] = prob

	doc_terms = tokenizer.tokenizer(doc_string)
	
	result = naive_bayes.ApplyMultinomialNB(classes, v, prior, condprob, doc_terms)

	return result
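naive_bayes.ApplyMultinomialNB is not shown above. As an assumption, here is a sketch of the textbook ApplyMultinomialNB (sum of log prior and log conditional probabilities per class), consistent with the v / prior / condprob structures loaded in NBtesting:

import math

def apply_multinomial_nb(classes, v, prior, condprob, doc_terms):
    vocab = set(v)
    scores = {}
    for c in classes:
        score = math.log(prior[c])
        for term in doc_terms:
            # Only vocabulary terms with a stored probability contribute.
            if term in vocab and term in condprob and c in condprob[term]:
                score += math.log(condprob[term][c])
        scores[c] = score
    # Class with the highest log-probability wins.
    return max(scores, key=scores.get)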
Example #31
    def parse_input(self):
        assign_token = []
        assign_value = []

        input_lines = input(">>: ")  # 3,4
        input_tokens = tokenizer(input_lines)

        for token in input_tokens:
            if not token[1] == "comma":
                if token[1] == 'bool':
                    val = (token[0].replace("\"", ""), token[1])
                    token = val
                assign_value.append(token)

        i = 0
        assign_token.append(self.current_token)
        assign_token.append(("=", "assignment"))
        assign_token.append(assign_value[i])
        assign_token.append(("INPUT", "INPUT"))
        self.keep("identifier")
        i += 1
        while self.current_token[1] == 'comma':
            self.keep("comma")
            assign_token.append(self.current_token)
            assign_token.append(("=", "assignment"))
            self.keep("identifier")
            if assign_value[i][1] != 'EOF':
                assign_token.append(assign_value[i])
                assign_token.append(("INPUT", "INPUT"))
            else:
                self.error("\nExpected more inputs")
            i += 1

        if (len(assign_value) - 1) != i:
            # input values is greater than identifiers
            self.error("\nExpected less inputs")

        for i in range(0, len(assign_token), 4):
            input_assign = Parser(assign_token[i:i + 4])
            input_assign.parse_assign()
Example #32
def batch_test():
    for jack_path, target_path in [
        [
            '../test/ArrayTest/Main.jack',
            '../test/engine_test/array_main_actual.xml'
        ],
        [
            '../test/Square/Main.jack',
            '../test/engine_test/square_main_actual.xml'
        ],
        [
            '../test/Square/Square.jack',
            '../test/engine_test/square_actual.xml'
        ],
        [
            '../test/Square/SquareGame.jack',
            '../test/engine_test/square_game_actual.xml'
        ],
        [
            '../test/ExpressionLessSquare/Main.jack',
            '../test/engine_test/exp_main_actual.xml'
        ],
        [
            '../test/ExpressionLessSquare/Square.jack',
            '../test/engine_test/exp_actual.xml'
        ],
        [
            '../test/ExpressionLessSquare/SquareGame.jack',
            '../test/engine_test/exp_game_actual.xml'
        ]
    ]:
        print(jack_path)
        print(target_path)
        with open(jack_path, 'r') as f:
            content = f.read()
        tokens = tokenizer(content)
        CompilationEngine(tokens).treeToXml(target_path)
Example #33
import unicodedata
import nltk
import enchant
import os

import tokenizer
import metaphone
import plausibleWords
import dictionarySearch
import getngrams

splitSentencesArray = tokenizer.tokenizer()
incorrectlySpelled = dictionarySearch.dictionarySearch(splitSentencesArray)
correctlySpelled = []
summationArray = []
outputSentencesArray = splitSentencesArray
tempSum = 0.0

for i in xrange(len(incorrectlySpelled)):
	plausibleList = plausibleWords.plausibleWords(splitSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]])
	for j in xrange(len(plausibleList)):
		if incorrectlySpelled[i][1] != 0 and incorrectlySpelled[i][1] != (len(splitSentencesArray[incorrectlySpelled[i][0]]) - 1):
			testString1 = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]-1] + plausibleList[j] + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]+1]
			testString = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]-1] + " " + plausibleList[j] + " " + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]+1]
		elif incorrectlySpelled[i][1] == 0:
			testString1 = plausibleList[j] + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]+1]
			testString = plausibleList[j] + " " + outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]+1]
		else:
			testString1 = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]-1] + plausibleList[j]
			testString = outputSentencesArray[incorrectlySpelled[i][0]][incorrectlySpelled[i][1]-1] + " " + plausibleList[j]
		os.system("getngrams.py " + testString + " -noprint -quit")
Example #34
def preProcess(texto, linguagem):
    return stemming.stemmer(
        removalStopwords.removalStopwords(tokenizer.tokenizer(texto),
                                          linguagem), linguagem)
Example #35
def _is_jack_file(file):
    return file.split('.')[1] == 'jack'


def _get_jack_files(arg):
    if _is_file(arg):
        files = [arg]
    else:
        content_in_directory = \
            ['{}/'.format(arg) + content for content in os.listdir(arg)]
        files = [file for file in content_in_directory if _is_file(file)]

    return (file for file in files if _is_jack_file(file))


def _get_output_filename(arg):
    return '{}.vm'.format(arg.split('.jack')[0])


##################--------- DRIVER CODE  -------######################  # NOQA

if __name__ != '__main__':
    print 'Please run as a self-contained program'

jack_files = _get_jack_files(sys.argv[1])

for file in jack_files:
    token_gen = tokenizer(file)
    output_file = open(_get_output_filename(file), 'w')
    compile_file(output_file, token_gen)
Example #36
    def pass0(self):

        # read the input file and parse it (also invokes the preprocessor)
        self.t = tokenizer.tokenizer()
        self.t.parse(self.mainFileName)
Example #37
        return "-"

def binary_op():
    #print "Parsing bin op with %s" % sym
    if sym.type == "+":
        return "+"
    elif sym.type == "-":
        return "-"
    elif sym.type == "*":
        return "*"
    elif sym.type == "/":
        return "/"

if __name__ == '__main__':
    import sys

    line = sys.stdin.read()
    t = tokenizer()(line)

    def f():
        global sym
        try:
            sym = map_sym(t.next())
            #print "Sym = %s" % sym
        except StopIteration:
            return
    getsym = f

    getsym()
    print expect(expr())
Example #38
elif args.dataset == 'covid':
    folder_name = 'covid/'
    pos_weight = torch.tensor((290726 - 405) / 290726, dtype=torch.float32)
    # pos_weight = torch.tensor(1.0, dtype=torch.float32)
    valid = readData(folder_name + 'dev.csv')
    test = readData(folder_name + 'test.csv')
    train = readData(folder_name + 'train.csv')
elif args.dataset == 'for_submission':
    folder_name = 'for_submission/'
    pos_weight = torch.tensor((2335 - 120) / 120, dtype=torch.float32)
    # pos_weight = torch.tensor(1.0, dtype=torch.float32)
    valid = readData(folder_name + 'dev.csv')
    test = readData(folder_name + 'train.csv')
    train = readData(folder_name + 'train.csv')

tk = tokenizer(train + valid + test)

# def make_batch(data):
#     X = []
#     Y = []
#     lengths = []
#     weights = []
#     for d in data:
#         x, x_len = tk.tokenize(d[0])
#         y = d[1]
#         Y.append(float(y))
#         # X.append(strToLong(x, char2int, max_length))
#         X.append(x)
#         lengths.append(x_len)
#     X = np.stack(X, axis=0)
#     Y = np.array(Y)
Example #39
        # Add the next word to the sequence
        sequence += "{} ".format(next_state.split()[0])
        curr_state = next_state
        lower_bound = 0.0
        l += 1
        # Ensures the sentence generated ends appropriately
        if l >= k and next_state[-1] == '.':
            end = True
    return sequence
if __name__ == '__main__':
    # Command line args conditionals
    if len(sys.argv) >= 3:
        filename = sys.argv[1]
        N = int(sys.argv[2])
    elif len(sys.argv) == 2:
        filename = sys.argv[1]
        N = 1
    else:
        filename = "markov_test_1.txt"
        N = 1
    begin_state = ""
    directory = "text_files/" + filename
    # Deque object used to store n-token states
    tokens = tokenizer.tokenizer(directory)
    markovchain, begin_state = create_map(tokens, N)
    #pairs = markovchain.get_pairs()
    #for i in range(50):
        #print("{} contains {}".format(pairs[i][0], pairs[i][1]))
    generated_string = generate_sequence(markovchain, None, 25)
    print(generated_string)
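create_map is referenced above but not shown. A plausible sketch (editor's assumption) consistent with how generate_sequence walks the chain: each state is a space-joined window of N tokens, and the map sends a state to the list of states that follow it.

from collections import defaultdict, deque

def create_map(tokens, n):
    window = deque(maxlen=n)
    states = []
    for tok in tokens:
        window.append(tok)
        if len(window) == n:
            states.append(" ".join(window))
    chain = defaultdict(list)
    for cur, nxt in zip(states, states[1:]):
        chain[cur].append(nxt)
    begin_state = states[0] if states else ""
    return chain, begin_state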
Example #40
    def test_reveal_offset(self):
        e1 = "to sentence here 12:30 3h 15min tags:tag1,tag3 , tag4"

        tok = tokenizer(e1)

        self.assertEqual(tok.reveal_next_token(1), ("WORD", "sentence"))
Example #41
                i,
                j[0],
                k[0],
            ))
            for x in range(1, l):
                print('%20s    %20s   %20s' % (
                    "",
                    "",
                    k[x],
                ))


if __name__ == "__main__":
    tokens = tokenizer.tokens
    table = {k: [0, set()] for k in tokens}
    lexer = lex.lex(module=tokenizer())
    filename = sys.argv[1]
    if os.path.exists(filename):
        file = open(filename, 'r')
        data = file.read()
        lexer.input(data)
        while True:
            tokk = lexer.token()
            if not tokk: break
            table[tokk.type][0] += 1
            table[tokk.type][1].add(tokk.value)
        file.close()
    else:
        print("File Does Not Exist")
    Print()
Example #42
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import pandas as pd
import json

import tokenizer

import dummytokenize

import pickle

t = tokenizer.tokenizer()


class create_CNN():
    def __init__(self, X, Y, config, tokenizer=None):
        self.config_dict = json.load(open(config))
        self.lb = LabelEncoder()
        self.tfidf = TfidfVectorizer(tokenizer=tokenizer,
                                     preprocessor=tokenizer,
                                     token_pattern=None)

        self.x = X
        self.y = Y

    def fit_tfidf(self):
        print(self.x)
Example #43
        logging.basicConfig(format='[%(asctime)s.%(msecs)03d] %(levelname)s %(message)s', datefmt='%Y-%m-%d_%H:%M:%S', level=numeric_level)
        logging.info('Created Logger level={}'.format(loglevel))
    else:
        logging.basicConfig(filename=logfile, format='[%(asctime)s.%(msecs)03d] %(levelname)s %(message)s', datefmt='%Y-%m-%d_%H:%M:%S', level=numeric_level)
        logging.info('Created Logger level={} file={}'.format(loglevel, logfile))

        
################################################
### MAIN #######################################
################################################

if __name__ == '__main__':
    fin = None
    fout = None
    num_threads = 1
    t = tokenizer()

    usage = """usage: {} [-i FILE -o FILE -num_threads INT]  [tok_options]
    -i: (stdin)
    -o: (stdout)
    -num_threads: 1 (used only when -i and -o are used)
    -h: this message

  tok_options (See https://github.com/OpenNMT/Tokenizer for more details):
""".format(sys.argv.pop(0),t.tokopts)
    for k,v in t.tokopts.items():
        usage += "    -{}: {}\n".format(k,v)

    sys.argv = t.updateOpts(sys.argv)        
    while len(sys.argv):
        tok = sys.argv.pop(0)
Example #44
def _is_jack_file(file):
    return file.split('.')[1] == 'jack'


def _get_jack_files(arg):
    if _is_file(arg):
        files = [arg]
    else:
        content_in_directory = \
            ['{}/'.format(arg) + content for content in os.listdir(arg)]
        files = [file for file in content_in_directory if _is_file(file)]

    return (file for file in files if _is_jack_file(file))


def _get_output_filename(arg):
    return '{}_output.xml'.format(arg.split('.jack')[0])

##################--------- DRIVER CODE  -------######################  # NOQA

if __name__ != '__main__':
    print 'Please run as a self-contained program'

jack_files = _get_jack_files(sys.argv[1])

for file in jack_files:
    token_gen = tokenizer(file)
    output_file = open(_get_output_filename(file), 'w')
    compile_file(output_file, token_gen)
Example #45
 stateOutput = open('states'+args.output+'.csv','w')
 inputcsv = csv.reader(csvInput)
 citycsv = csv.writer(cityOutput)
 statecsv = csv.writer(stateOutput)
 cityDict = dict()
 stateDict = dict()
 cityMeanDict = dict()
 stateMeanDict = dict()
 for row in inputcsv:
     if args.output == 'Yelp':
         [sentiment,city,state] = row
         if state not in states:
             continue
     elif args.output == 'Zagat':
         [sentiment,state,city] = row
     city = ' '.join(tokenizer(city))
     city = city+':'+state
     sentiment = float(sentiment)
     if city in cityDict:
         crrnt = cityDict[city]
         cityDict[city] = [crrnt[0]+sentiment,crrnt[1]+1]
     else:
         cityDict[city] = [sentiment,1]
     if state in stateDict:
         crrnt = stateDict[state]
         stateDict[state] = [crrnt[0]+sentiment,crrnt[1]+1.0]
     else:
         stateDict[state] = [sentiment,1.0]
 for k in cityDict:
     v = cityDict[k]
     mean = v[0]/v[1]